{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.997849462365592, "eval_steps": 500, "global_step": 2784, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022431484925773e-09, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.002867383512544803, "grad_norm": 0.12072640104660734, "learning_rate": 2e-06, "loss": 0.0, "num_tokens": 87679.0, "reward": 0.796875, "reward_std": 0.18543373048305511, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7815011540266774e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.005734767025089606, "grad_norm": 0.11813835707013225, "learning_rate": 1.999999366948742e-06, "loss": -0.0, "num_tokens": 170424.0, "reward": 0.5703125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4969765841960907, "step": 2 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979305421724224e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.008602150537634409, "grad_norm": 0.08307332827566753, "learning_rate": 1.9999974677957702e-06, "loss": 0.0, "num_tokens": 263226.0, "reward": 0.6328125, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 3 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.011469534050179211, "grad_norm": 0.07731912706360194, "learning_rate": 1.9999943025434887e-06, "loss": -0.0, "num_tokens": 344864.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 4 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 7.31812171195528e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "epoch": 0.014336917562724014, "grad_norm": 0.09771629934875779, "learning_rate": 1.9999898711959057e-06, "loss": -0.0, "num_tokens": 453394.0, "reward": 0.453125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.453125, "rewards/drgrpo_math_reward/std": 0.4997538626194, "step": 5 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.2583933094613625e-09, "advantages/std": 0.6185742020606995, "advantages/var": 0.38263404345503105, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 0.017204301075268817, "grad_norm": 0.1342631914269599, "learning_rate": 1.999984173758631e-06, "loss": 0.0, "num_tokens": 545520.0, "reward": 0.703125, "reward_std": 0.19438527524471283, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 6 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.02007168458781362, "grad_norm": 0.12138951673049583, "learning_rate": 1.999977210238878e-06, "loss": 0.0, "num_tokens": 634042.0, "reward": 0.640625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 7 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.4083979969389336e-09, "advantages/std": 0.6612637639045715, "advantages/var": 0.4372697654532409, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.022939068100358423, "grad_norm": 0.23519275858410402, "learning_rate": 1.999968980645464e-06, "loss": -0.0, "num_tokens": 720467.0, "reward": 0.8515625, "reward_std": 0.1938612163066864, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 8 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 9.786317257174392e-09, "advantages/std": 0.6185775995254517, "advantages/var": 0.38263824663467005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.025806451612903226, "grad_norm": 0.0988104248714376, "learning_rate": 1.9999594849888084e-06, "loss": 0.0, "num_tokens": 823071.0, "reward": 0.7265625, "reward_std": 0.19674429297447205, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 9 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.02867383512544803, "grad_norm": 0.07287546496174245, "learning_rate": 1.9999487232809332e-06, "loss": -0.0, "num_tokens": 919261.0, "reward": 0.609375, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 10 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9875209889720355e-09, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.031541218637992835, "grad_norm": 0.07623329608908455, "learning_rate": 1.9999366955354637e-06, "loss": -0.0, "num_tokens": 1004963.0, "reward": 0.828125, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 11 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 9.786512459149173e-09, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.034408602150537634, "grad_norm": 0.1237469953585538, "learning_rate": 1.999923401767629e-06, "loss": -0.0, "num_tokens": 1090102.0, "reward": 0.71875, "reward_std": 0.1820138692855835, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 12 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.1266932113463096e-08, "advantages/std": 0.6612785458564758, "advantages/var": 0.4372893152100552, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.03727598566308244, "grad_norm": 0.1434758279295005, "learning_rate": 1.9999088419942594e-06, "loss": 0.0, "num_tokens": 1185270.0, "reward": 0.7734375, "reward_std": 0.21436068415641785, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 13 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917033813576203e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.04014336917562724, "grad_norm": 0.12193245342510368, "learning_rate": 1.99989301623379e-06, "loss": -0.0, "num_tokens": 1278859.0, "reward": 0.6875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 14 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262492693233955e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.043010752688172046, "grad_norm": 0.13613144621465645, "learning_rate": 1.999875924506258e-06, "loss": 0.0, "num_tokens": 1378423.0, "reward": 0.625, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 15 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962749759103603e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.045878136200716846, "grad_norm": 0.052480010681470804, "learning_rate": 1.999857566833302e-06, "loss": 0.0, "num_tokens": 1466568.0, "reward": 0.7421875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 16 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.528086507037825e-10, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.04874551971326165, "grad_norm": 0.08619603417965296, "learning_rate": 1.9998379432381658e-06, "loss": -0.0, "num_tokens": 1561456.0, "reward": 0.828125, "reward_std": 0.1820138692855835, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 17 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.2351708626582845e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.05161290322580645, "grad_norm": 0.07287898456293838, "learning_rate": 1.999817053745694e-06, "loss": 0.0, "num_tokens": 1650097.0, "reward": 0.75, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 18 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6261990006727604e-09, "advantages/std": 0.5726990103721619, "advantages/var": 0.32798415648125356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.05448028673835126, "grad_norm": 0.21959986144796612, "learning_rate": 1.999794898382336e-06, "loss": -0.0, "num_tokens": 1757617.0, "reward": 0.3984375, "reward_std": 0.17806214094161987, "rewards/drgrpo_math_reward/mean": 0.3984375, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 19 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.0224895169745475e-09, "advantages/std": 0.618563175201416, "advantages/var": 0.3826204017152577, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.05734767025089606, "grad_norm": 0.10157356395625096, "learning_rate": 1.999771477176142e-06, "loss": 0.0, "num_tokens": 1850850.0, "reward": 0.6328125, "reward_std": 0.17859892547130585, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 20 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.060215053763440864, "grad_norm": 0.036696350533152414, "learning_rate": 1.9997467901567657e-06, "loss": -0.0, "num_tokens": 1928908.0, "reward": 0.8046875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 21 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016648251989223e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 0.06308243727598567, "grad_norm": 0.11866636461690809, "learning_rate": 1.9997208373554635e-06, "loss": 0.0, "num_tokens": 2015022.0, "reward": 0.71875, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 22 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125846186210722e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.06594982078853047, "grad_norm": 4.854693100648363, "learning_rate": 1.9996936188050943e-06, "loss": 0.0, "num_tokens": 2093001.0, "reward": 0.78125, "reward_std": 0.13098980486392975, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 23 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.816728712284573e-09, "advantages/std": 0.661279559135437, "advantages/var": 0.43729065533035794, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.421875, "epoch": 0.06881720430107527, "grad_norm": 0.11279722346323875, "learning_rate": 1.9996651345401195e-06, "loss": 0.0, "num_tokens": 2194124.0, "reward": 0.5234375, "reward_std": 0.21648234128952026, "rewards/drgrpo_math_reward/mean": 0.5234375, "rewards/drgrpo_math_reward/std": 0.5014128684997559, "step": 24 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814329083993624e-09, "advantages/std": 0.5227940678596497, "advantages/var": 0.27331363738923997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.07168458781362007, "grad_norm": 0.12123531756465056, "learning_rate": 1.999635384596603e-06, "loss": 0.0, "num_tokens": 2289186.0, "reward": 0.5234375, "reward_std": 0.1433563083410263, "rewards/drgrpo_math_reward/mean": 0.5234375, "rewards/drgrpo_math_reward/std": 0.5014128684997559, "step": 25 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016467281190481e-09, "advantages/std": 0.5227928161621094, "advantages/var": 0.2733123286307091, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.671875, "epoch": 0.07455197132616488, "grad_norm": 0.09411124196600001, "learning_rate": 1.9996043690122116e-06, "loss": 0.0, "num_tokens": 2376485.0, "reward": 0.7734375, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 26 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185722351074219, "advantages/var": 0.3826316100457916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.07741935483870968, "grad_norm": 0.11120568828847453, "learning_rate": 1.999572087826214e-06, "loss": 0.0, "num_tokens": 2466063.0, "reward": 0.7421875, "reward_std": 0.19097033143043518, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 27 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.08028673835125448, "grad_norm": 0.09857405192647845, "learning_rate": 1.9995385410794814e-06, "loss": -0.0, "num_tokens": 2544984.0, "reward": 0.703125, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 28 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 9.78663788288456e-09, "advantages/std": 0.618557333946228, "advantages/var": 0.38261317537866546, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.08315412186379928, "grad_norm": 0.13390586710223532, "learning_rate": 1.999503728814488e-06, "loss": 0.0, "num_tokens": 2650054.0, "reward": 0.59375, "reward_std": 0.17176413536071777, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.4930621087551117, "step": 29 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676070809364319, "advantages/var": 0.21865638214189076, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.08602150537634409, "grad_norm": 0.07979029691529577, "learning_rate": 1.9994676510753086e-06, "loss": 0.0, "num_tokens": 2748901.0, "reward": 0.7109375, "reward_std": 0.12019839137792587, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 30 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.528111170810656e-10, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.08888888888888889, "grad_norm": 0.10099304220854324, "learning_rate": 1.9994303079076223e-06, "loss": -0.0, "num_tokens": 2850862.0, "reward": 0.5234375, "reward_std": 0.17859891057014465, "rewards/drgrpo_math_reward/mean": 0.5234375, "rewards/drgrpo_math_reward/std": 0.5014128684997559, "step": 31 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524139108748537e-09, "advantages/std": 0.5726962089538574, "advantages/var": 0.3279809477501203, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.09175627240143369, "grad_norm": 0.09555584445187362, "learning_rate": 1.999391699358709e-06, "loss": 0.0, "num_tokens": 2946035.0, "reward": 0.7265625, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 32 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 9.797729076606565e-09, "advantages/std": 0.5228021740913391, "advantages/var": 0.27332211323463085, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.09462365591397849, "grad_norm": 0.095240391953869, "learning_rate": 1.9993518254774516e-06, "loss": -0.0, "num_tokens": 3030677.0, "reward": 0.7265625, "reward_std": 0.15148437023162842, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 33 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016611691573306e-09, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.0974910394265233, "grad_norm": 0.09573900444961869, "learning_rate": 1.9993106863143336e-06, "loss": 0.0, "num_tokens": 3114495.0, "reward": 0.7578125, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 34 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.4251693997307513e-08, "advantages/std": 0.5227856040000916, "advantages/var": 0.27330478774974054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.1003584229390681, "grad_norm": 0.09358643008271235, "learning_rate": 1.9992682819214415e-06, "loss": 0.0, "num_tokens": 3196979.0, "reward": 0.8125, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 35 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 1.1579395756169161e-08, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.1032258064516129, "grad_norm": 0.16735976169723088, "learning_rate": 1.9992246123524646e-06, "loss": -0.0, "num_tokens": 3284771.0, "reward": 0.625, "reward_std": 0.13781969249248505, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 36 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962665216109293e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.1060931899641577, "grad_norm": 0.12078546374354511, "learning_rate": 1.999179677662692e-06, "loss": -0.0, "num_tokens": 3367767.0, "reward": 0.765625, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 37 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907268126346096e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.10896057347670252, "grad_norm": 0.10079542415136743, "learning_rate": 1.999133477909016e-06, "loss": 0.0, "num_tokens": 3457120.0, "reward": 0.7265625, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 38 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.269660554926477e-09, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.11182795698924732, "grad_norm": 6.532784934364094, "learning_rate": 1.9990860131499304e-06, "loss": -0.0, "num_tokens": 3543851.0, "reward": 0.6875, "reward_std": 0.1820138692855835, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 39 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.344354173221399e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.11469534050179211, "grad_norm": 0.09757495386632052, "learning_rate": 1.9990372834455305e-06, "loss": 0.0, "num_tokens": 3627069.0, "reward": 0.703125, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 40 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2524670566012098e-09, "advantages/std": 0.5726868510246277, "advantages/var": 0.3279702293365041, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.11756272401433691, "grad_norm": 0.09730325289294997, "learning_rate": 1.9989872888575127e-06, "loss": -0.0, "num_tokens": 3711073.0, "reward": 0.671875, "reward_std": 0.16545338928699493, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 41 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917148060443483e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.12043010752688173, "grad_norm": 0.0874621365795184, "learning_rate": 1.9989360294491754e-06, "loss": 0.0, "num_tokens": 3797362.0, "reward": 0.7578125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 42 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.8166759048326044e-09, "advantages/std": 0.6612919569015503, "advantages/var": 0.43730705226268185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.515625, "epoch": 0.12329749103942653, "grad_norm": 0.1518769862850437, "learning_rate": 1.9988835052854186e-06, "loss": -0.0, "num_tokens": 3912127.0, "reward": 0.4140625, "reward_std": 0.23250606656074524, "rewards/drgrpo_math_reward/mean": 0.4140625, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 43 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.065538968228456e-09, "advantages/std": 0.5726931691169739, "advantages/var": 0.32797746595324284, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.12616487455197134, "grad_norm": 0.09380987299728875, "learning_rate": 1.9988297164327424e-06, "loss": -0.0, "num_tokens": 4006775.0, "reward": 0.6015625, "reward_std": 0.172288179397583, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 44 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.12903225806451613, "grad_norm": 0.12294840293247179, "learning_rate": 1.9987746629592504e-06, "loss": -0.0, "num_tokens": 4100127.0, "reward": 0.6640625, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 45 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.13189964157706094, "grad_norm": 0.10521562596882879, "learning_rate": 1.9987183449346446e-06, "loss": -0.0, "num_tokens": 4177228.0, "reward": 0.8046875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 46 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.6555842735419153e-09, "advantages/std": 0.7014068961143494, "advantages/var": 0.4919716339167657, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "epoch": 0.13476702508960572, "grad_norm": 0.14721937323309717, "learning_rate": 1.9986607624302303e-06, "loss": 0.0, "num_tokens": 4285112.0, "reward": 0.625, "reward_std": 0.26249876618385315, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 47 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899540529955257e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.13763440860215054, "grad_norm": 0.08261311695650624, "learning_rate": 1.9986019155189124e-06, "loss": 0.0, "num_tokens": 4385910.0, "reward": 0.7734375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 48 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.2196811125523014e-08, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.14050179211469535, "grad_norm": 0.11604750525829459, "learning_rate": 1.9985418042751972e-06, "loss": -0.0, "num_tokens": 4474601.0, "reward": 0.765625, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 49 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 5.633320329065741e-09, "advantages/std": 0.6612956523895264, "advantages/var": 0.4373119398692893, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.14336917562724014, "grad_norm": 0.09818812063392984, "learning_rate": 1.9984804287751916e-06, "loss": 0.0, "num_tokens": 4573799.0, "reward": 0.5859375, "reward_std": 0.23592591285705566, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 50 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.14623655913978495, "grad_norm": 0.102057115147288, "learning_rate": 1.9984177890966035e-06, "loss": 0.0, "num_tokens": 4654716.0, "reward": 0.5859375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 51 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.4535487590567085e-09, "advantages/std": 0.5227980017662048, "advantages/var": 0.2733177506507367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.14910394265232976, "grad_norm": 0.11635955339733696, "learning_rate": 1.998353885318741e-06, "loss": -0.0, "num_tokens": 4738961.0, "reward": 0.65625, "reward_std": 0.14571532607078552, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 52 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.011143494991813e-09, "advantages/std": 0.6185839772224426, "advantages/var": 0.3826461368763354, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.609375, "epoch": 0.15197132616487455, "grad_norm": 0.1053439223761662, "learning_rate": 1.9982887175225135e-06, "loss": 0.0, "num_tokens": 4833833.0, "reward": 0.6796875, "reward_std": 0.20463991165161133, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 53 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.691829197043768e-09, "advantages/std": 0.5726856589317322, "advantages/var": 0.3279688639460723, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.15483870967741936, "grad_norm": 0.14214478632458308, "learning_rate": 1.9982222857904287e-06, "loss": -0.0, "num_tokens": 4920722.0, "reward": 0.6875, "reward_std": 0.16333173215389252, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 54 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349153895649778e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.15770609318996415, "grad_norm": 0.07374782139639248, "learning_rate": 1.9981545902065973e-06, "loss": 0.0, "num_tokens": 5007022.0, "reward": 0.5859375, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 55 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962814881146116e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.16057347670250896, "grad_norm": 0.062216704713937636, "learning_rate": 1.998085630856728e-06, "loss": -0.0, "num_tokens": 5086937.0, "reward": 0.7109375, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 56 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.041880175145966e-10, "advantages/std": 0.6612740755081177, "advantages/var": 0.4372834029391157, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.16344086021505377, "grad_norm": 0.1276968079977411, "learning_rate": 1.998015407828131e-06, "loss": 0.0, "num_tokens": 5187209.0, "reward": 0.625, "reward_std": 0.20964756608009338, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 57 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675983786582947, "advantages/var": 0.21864824372386593, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 0.16630824372759856, "grad_norm": 0.08001414737929904, "learning_rate": 1.997943921209715e-06, "loss": -0.0, "num_tokens": 5288667.0, "reward": 0.5546875, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.4989531338214874, "step": 58 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814195036226835e-09, "advantages/std": 0.5227980017662048, "advantages/var": 0.2733177506507367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.16917562724014337, "grad_norm": 0.13591135236989843, "learning_rate": 1.997871171091991e-06, "loss": -0.0, "num_tokens": 5378634.0, "reward": 0.703125, "reward_std": 0.14571532607078552, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 59 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 6.005323810328915e-09, "advantages/std": 0.7754141092300415, "advantages/var": 0.6012670407930187, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.17204301075268819, "grad_norm": 0.15618674804291033, "learning_rate": 1.9977971575670664e-06, "loss": 0.0, "num_tokens": 5474256.0, "reward": 0.6875, "reward_std": 0.2885475754737854, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 60 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.17491039426523297, "grad_norm": 0.1138327127178479, "learning_rate": 1.9977218807286505e-06, "loss": -0.0, "num_tokens": 5563557.0, "reward": 0.5, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.5, "rewards/drgrpo_math_reward/std": 0.5019646286964417, "step": 61 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.011196882346898e-09, "advantages/std": 0.618573009967804, "advantages/var": 0.3826325686606289, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 0.17777777777777778, "grad_norm": 0.08132114696040446, "learning_rate": 1.997645340672052e-06, "loss": -0.0, "num_tokens": 5663385.0, "reward": 0.765625, "reward_std": 0.18885357677936554, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 62 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.300980052408865e-08, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.18064516129032257, "grad_norm": 0.10257084432126386, "learning_rate": 1.9975675374941777e-06, "loss": 0.0, "num_tokens": 5747683.0, "reward": 0.84375, "reward_std": 0.16675157845020294, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 63 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.0224889366485245e-09, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.18351254480286738, "grad_norm": 0.23992414863040273, "learning_rate": 1.9974884712935348e-06, "loss": 0.0, "num_tokens": 5822682.0, "reward": 0.8671875, "reward_std": 0.17859892547130585, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 64 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125814501076877e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.1863799283154122, "grad_norm": 0.11197749985688031, "learning_rate": 1.9974081421702293e-06, "loss": 0.0, "num_tokens": 5923545.0, "reward": 0.6484375, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 65 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.562938529588136e-09, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.18924731182795698, "grad_norm": 0.11472039064281611, "learning_rate": 1.997326550225966e-06, "loss": -0.0, "num_tokens": 6024457.0, "reward": 0.3828125, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.3828125, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 66 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.504965933612274e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.1921146953405018, "grad_norm": 0.1418423341425889, "learning_rate": 1.9972436955640485e-06, "loss": -0.0, "num_tokens": 6104953.0, "reward": 0.734375, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 67 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.1949820788530466, "grad_norm": 0.0804894342413209, "learning_rate": 1.9971595782893793e-06, "loss": 0.0, "num_tokens": 6186002.0, "reward": 0.828125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 68 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.13121587994051e-09, "advantages/std": 0.5726834535598755, "advantages/var": 0.32796633798126607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.1978494623655914, "grad_norm": 0.11992450664488502, "learning_rate": 1.99707419850846e-06, "loss": 0.0, "num_tokens": 6273682.0, "reward": 0.7109375, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 69 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.90714930920747e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "epoch": 0.2007168458781362, "grad_norm": 0.09013588232439032, "learning_rate": 1.9969875563293894e-06, "loss": -0.0, "num_tokens": 6370954.0, "reward": 0.53125, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.53125, "rewards/drgrpo_math_reward/std": 0.5009832978248596, "step": 70 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.203584229390681, "grad_norm": 0.08158329080487958, "learning_rate": 1.996899651861866e-06, "loss": 0.0, "num_tokens": 6459977.0, "reward": 0.796875, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 71 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.03866787895554e-09, "advantages/std": 0.7393403053283691, "advantages/var": 0.5466240870830461, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 0.2064516129032258, "grad_norm": 0.16017306287798955, "learning_rate": 1.996810485217186e-06, "loss": -0.0, "num_tokens": 6558404.0, "reward": 0.640625, "reward_std": 0.28223681449890137, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 72 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958683201273463e-10, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.20931899641577062, "grad_norm": 0.08280502539118079, "learning_rate": 1.9967200565082424e-06, "loss": -0.0, "num_tokens": 6640957.0, "reward": 0.6796875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 73 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 8.630640087964663e-09, "advantages/std": 0.7014076113700867, "advantages/var": 0.49197263728789054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.2121863799283154, "grad_norm": 0.15761095907087838, "learning_rate": 1.9966283658495283e-06, "loss": -0.0, "num_tokens": 6743334.0, "reward": 0.6171875, "reward_std": 0.2603819966316223, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 74 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.816762225733049e-09, "advantages/std": 0.6612716913223267, "advantages/var": 0.4372802497442905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.21505376344086022, "grad_norm": 0.19829770317655657, "learning_rate": 1.996535413357133e-06, "loss": 0.0, "num_tokens": 6831502.0, "reward": 0.75, "reward_std": 0.20517179369926453, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 75 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.408361690336558e-09, "advantages/std": 0.6612808108329773, "advantages/var": 0.4372923107759199, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.21792114695340503, "grad_norm": 0.18264161110922233, "learning_rate": 1.9964411991487446e-06, "loss": -0.0, "num_tokens": 6926494.0, "reward": 0.6640625, "reward_std": 0.21542644500732422, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 76 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 0.22078853046594982, "grad_norm": 0.03928779024896764, "learning_rate": 1.9963457233436466e-06, "loss": -0.0, "num_tokens": 7017039.0, "reward": 0.71875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 77 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.5193428761467165e-09, "advantages/std": 0.739337682723999, "advantages/var": 0.5466202090956926, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.22365591397849463, "grad_norm": 0.2506819296373822, "learning_rate": 1.9962489860627224e-06, "loss": 0.0, "num_tokens": 7110808.0, "reward": 0.6484375, "reward_std": 0.2767002582550049, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 78 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.225131532618201e-09, "advantages/std": 0.6612735390663147, "advantages/var": 0.43728269346928883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.22652329749103942, "grad_norm": 0.1557864040641026, "learning_rate": 1.9961509874284507e-06, "loss": -0.0, "num_tokens": 7196850.0, "reward": 0.6796875, "reward_std": 0.20858672261238098, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 79 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 9.958501673983143e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.22939068100358423, "grad_norm": 0.09767097714609889, "learning_rate": 1.9960517275649076e-06, "loss": 0.0, "num_tokens": 7283197.0, "reward": 0.6328125, "reward_std": 0.11336850374937057, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 80 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.23225806451612904, "grad_norm": 0.07097210651744987, "learning_rate": 1.995951206597767e-06, "loss": -0.0, "num_tokens": 7373022.0, "reward": 0.75, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 81 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.562875972037933e-09, "advantages/std": 0.5227925777435303, "advantages/var": 0.27331207934372515, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.23512544802867383, "grad_norm": 0.07965713683163471, "learning_rate": 1.9958494246542984e-06, "loss": 0.0, "num_tokens": 7467194.0, "reward": 0.6484375, "reward_std": 0.13782459497451782, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 82 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125995678848164e-09, "advantages/std": 0.5227746963500977, "advantages/var": 0.2732933831439368, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.23799283154121864, "grad_norm": 0.18075885139952805, "learning_rate": 1.9957463818633678e-06, "loss": 0.0, "num_tokens": 7547836.0, "reward": 0.640625, "reward_std": 0.12179600447416306, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 83 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504965933612274e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.609375, "epoch": 0.24086021505376345, "grad_norm": 0.11479922684896822, "learning_rate": 1.9956420783554387e-06, "loss": 0.0, "num_tokens": 7651354.0, "reward": 0.625, "reward_std": 0.16097760200500488, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 84 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 8.94433653701192e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.24372759856630824, "grad_norm": 0.06924948592705259, "learning_rate": 1.9955365142625695e-06, "loss": 0.0, "num_tokens": 7740603.0, "reward": 0.8046875, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 85 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0112157424628863e-09, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 0.24659498207885305, "grad_norm": 0.11054180140229349, "learning_rate": 1.9954296897184152e-06, "loss": -0.0, "num_tokens": 7833309.0, "reward": 0.671875, "reward_std": 0.18543371558189392, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 86 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.24946236559139784, "grad_norm": 0.07139527483113263, "learning_rate": 1.9953216048582267e-06, "loss": 0.0, "num_tokens": 7920489.0, "reward": 0.65625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 87 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.505166341645741e-09, "advantages/std": 0.5726664066314697, "advantages/var": 0.32794681328419983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.2523297491039427, "grad_norm": 0.0812733661592712, "learning_rate": 1.99521225981885e-06, "loss": -0.0, "num_tokens": 8009754.0, "reward": 0.5859375, "reward_std": 0.13941730558872223, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 88 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.252425081488457e-09, "advantages/std": 0.5726942420005798, "advantages/var": 0.3279786948206187, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.25519713261648747, "grad_norm": 0.1334826079479053, "learning_rate": 1.9951016547387284e-06, "loss": -0.0, "num_tokens": 8094420.0, "reward": 0.6875, "reward_std": 0.17358146607875824, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 89 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.979305421724224e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.671875, "epoch": 0.25806451612903225, "grad_norm": 0.11149895094992536, "learning_rate": 1.994989789757898e-06, "loss": -0.0, "num_tokens": 8187321.0, "reward": 0.5546875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.4989531338214874, "step": 90 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 1.0570293504372942e-08, "advantages/std": 0.5726990103721619, "advantages/var": 0.32798415648125356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.26093189964157704, "grad_norm": 0.1503200281820486, "learning_rate": 1.9948766650179924e-06, "loss": -0.0, "num_tokens": 8277097.0, "reward": 0.5078125, "reward_std": 0.17806214094161987, "rewards/drgrpo_math_reward/mean": 0.5078125, "rewards/drgrpo_math_reward/std": 0.5019033551216125, "step": 91 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.2637992831541219, "grad_norm": 0.08503746578744582, "learning_rate": 1.9947622806622382e-06, "loss": -0.0, "num_tokens": 8377236.0, "reward": 0.75, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 92 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.26666666666666666, "grad_norm": 0.05712285492918599, "learning_rate": 1.994646636835458e-06, "loss": -0.0, "num_tokens": 8462836.0, "reward": 0.8359375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 93 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958849501312727e-10, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.26953405017921145, "grad_norm": 0.07317141316141637, "learning_rate": 1.9945297336840688e-06, "loss": -0.0, "num_tokens": 8536756.0, "reward": 0.8125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 94 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344599133202926e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.2724014336917563, "grad_norm": 0.07050755865291349, "learning_rate": 1.994411571356082e-06, "loss": 0.0, "num_tokens": 8627122.0, "reward": 0.7890625, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 95 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.78141726951285e-09, "advantages/std": 0.5227986574172974, "advantages/var": 0.27331843619732865, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.2752688172043011, "grad_norm": 0.09077858932382432, "learning_rate": 1.9942921500011035e-06, "loss": 0.0, "num_tokens": 8704876.0, "reward": 0.8671875, "reward_std": 0.14677615463733673, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 96 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.0349232344696665e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.27813620071684586, "grad_norm": 0.06366679062939669, "learning_rate": 1.994171469770333e-06, "loss": 0.0, "num_tokens": 8788584.0, "reward": 0.734375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 97 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5056352916096475e-09, "advantages/std": 0.618557870388031, "advantages/var": 0.38261383901897617, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.2810035842293907, "grad_norm": 0.14557426372001275, "learning_rate": 1.994049530816563e-06, "loss": 0.0, "num_tokens": 8874456.0, "reward": 0.8046875, "reward_std": 0.17282496392726898, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 98 }, { "advantages/mean": -9.313225746154785e-09, "advantages/snr": 1.4083628328234916e-08, "advantages/std": 0.6612802743911743, "advantages/var": 0.4372916012988668, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.2838709677419355, "grad_norm": 0.17422381430621553, "learning_rate": 1.993926333294182e-06, "loss": 0.0, "num_tokens": 8963952.0, "reward": 0.796875, "reward_std": 0.21436560153961182, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 99 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.2867383512544803, "grad_norm": 0.07608279904949532, "learning_rate": 1.9938018773591697e-06, "loss": 0.0, "num_tokens": 9039636.0, "reward": 0.75, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 100 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.011171929325739e-09, "advantages/std": 0.6185781359672546, "advantages/var": 0.38263891029672337, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.2896057347670251, "grad_norm": 0.13534176496966346, "learning_rate": 1.9936761631691005e-06, "loss": -0.0, "num_tokens": 9122751.0, "reward": 0.8125, "reward_std": 0.19780512154102325, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 101 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.327802856091802e-09, "advantages/std": 0.7014012336730957, "advantages/var": 0.4919636905981406, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.2924731182795699, "grad_norm": 0.17364832338354966, "learning_rate": 1.993549190883142e-06, "loss": 0.0, "num_tokens": 9210361.0, "reward": 0.5625, "reward_std": 0.25460314750671387, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49802759289741516, "step": 102 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.757447884868554e-09, "advantages/std": 0.572684109210968, "advantages/var": 0.32796708894275994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.2953405017921147, "grad_norm": 0.1186100841602063, "learning_rate": 1.9934209606620533e-06, "loss": 0.0, "num_tokens": 9287648.0, "reward": 0.796875, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 103 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.2982078853046595, "grad_norm": 0.05043194574683359, "learning_rate": 1.993291472668187e-06, "loss": -0.0, "num_tokens": 9365932.0, "reward": 0.6953125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 104 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.439391676409106e-09, "advantages/std": 0.5726771354675293, "advantages/var": 0.3279591014872949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.3010752688172043, "grad_norm": 0.1576253855421573, "learning_rate": 1.993160727065489e-06, "loss": 0.0, "num_tokens": 9438450.0, "reward": 0.78125, "reward_std": 0.1530819982290268, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 105 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.757241732624561e-09, "advantages/std": 0.5726962089538574, "advantages/var": 0.3279809477501203, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.3039426523297491, "grad_norm": 0.09728783396370186, "learning_rate": 1.9930287240194956e-06, "loss": 0.0, "num_tokens": 9533008.0, "reward": 0.5546875, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.4989531338214874, "step": 106 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.493752592678455e-08, "advantages/std": 0.46760883927345276, "advantages/var": 0.21865802656666578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.3068100358422939, "grad_norm": 0.15227602098741566, "learning_rate": 1.992895463697337e-06, "loss": -0.0, "num_tokens": 9614514.0, "reward": 0.7890625, "reward_std": 0.11914245784282684, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 107 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917699002625455e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.3096774193548387, "grad_norm": 0.08145354636826882, "learning_rate": 1.992760946267734e-06, "loss": -0.0, "num_tokens": 9699069.0, "reward": 0.78125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 108 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907308748317195e-09, "advantages/std": 0.5227856040000916, "advantages/var": 0.27330478774974054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.3125448028673835, "grad_norm": 0.08781585258601152, "learning_rate": 1.9926251719009997e-06, "loss": 0.0, "num_tokens": 9788484.0, "reward": 0.671875, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 109 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.3010125501632784e-08, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.3154121863799283, "grad_norm": 0.08383385156663044, "learning_rate": 1.9924881407690383e-06, "loss": 0.0, "num_tokens": 9886743.0, "reward": 0.6875, "reward_std": 0.15072788298130035, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 110 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344329800322181e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.31827956989247314, "grad_norm": 0.12773517568175324, "learning_rate": 1.9923498530453453e-06, "loss": -0.0, "num_tokens": 9972379.0, "reward": 0.8359375, "reward_std": 0.13888053596019745, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 111 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5055823378090512e-09, "advantages/std": 0.618579626083374, "advantages/var": 0.3826407538054468, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.3211469534050179, "grad_norm": 0.1096834650176715, "learning_rate": 1.992210308905007e-06, "loss": 0.0, "num_tokens": 10064282.0, "reward": 0.703125, "reward_std": 0.2001592218875885, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 112 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814215346364857e-09, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.3240143369175627, "grad_norm": 0.14542735807746063, "learning_rate": 1.992069508524701e-06, "loss": 0.0, "num_tokens": 10156504.0, "reward": 0.6171875, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 113 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 8.45011606469443e-09, "advantages/std": 0.6612850427627563, "advantages/var": 0.4372979077817405, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.32688172043010755, "grad_norm": 0.1774709869930943, "learning_rate": 1.9919274520826937e-06, "loss": -0.0, "num_tokens": 10237639.0, "reward": 0.5625, "reward_std": 0.22331714630126953, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49802759289741516, "step": 114 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.449820789068217e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.32974910394265233, "grad_norm": 0.11207851970747312, "learning_rate": 1.991784139758845e-06, "loss": 0.0, "num_tokens": 10308114.0, "reward": 0.8203125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 115 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.3326164874551971, "grad_norm": 0.061326552584535385, "learning_rate": 1.9916395717346014e-06, "loss": 0.0, "num_tokens": 10389388.0, "reward": 0.7421875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 116 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.33548387096774196, "grad_norm": 0.17403189241869, "learning_rate": 1.991493748193002e-06, "loss": -0.0, "num_tokens": 10475315.0, "reward": 0.75, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 117 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299880526045478e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.33835125448028674, "grad_norm": 0.05760583506548298, "learning_rate": 1.991346669318674e-06, "loss": 0.0, "num_tokens": 10547767.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 118 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.527911689045256e-09, "advantages/std": 0.618579626083374, "advantages/var": 0.3826407538054468, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.34121863799283153, "grad_norm": 0.13250194607386148, "learning_rate": 1.991198335297834e-06, "loss": 0.0, "num_tokens": 10639778.0, "reward": 0.6875, "reward_std": 0.2001592367887497, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 119 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814298618414938e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.34408602150537637, "grad_norm": 0.11855103268565774, "learning_rate": 1.9910487463182875e-06, "loss": 0.0, "num_tokens": 10722543.0, "reward": 0.8125, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 120 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6722127326138678e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.34695340501792116, "grad_norm": 0.11898391996372083, "learning_rate": 1.990897902569431e-06, "loss": 0.0, "num_tokens": 10807430.0, "reward": 0.8671875, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 121 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.763904706701757e-09, "advantages/std": 0.6185880303382874, "advantages/var": 0.3826511512778019, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.34982078853046594, "grad_norm": 0.13013435976302207, "learning_rate": 1.990745804242247e-06, "loss": 0.0, "num_tokens": 10908671.0, "reward": 0.3984375, "reward_std": 0.21146979928016663, "rewards/drgrpo_math_reward/mean": 0.3984375, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 122 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.35268817204301073, "grad_norm": 0.07003152151091946, "learning_rate": 1.9905924515293086e-06, "loss": 0.0, "num_tokens": 10998970.0, "reward": 0.5546875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.4989531338214874, "step": 123 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.034946312888557e-08, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.35555555555555557, "grad_norm": 0.16702168626513209, "learning_rate": 1.990437844624775e-06, "loss": -0.0, "num_tokens": 11077854.0, "reward": 0.7265625, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 124 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 6.005250875554911e-09, "advantages/std": 0.775423526763916, "advantages/var": 0.6012816458589896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.35842293906810035, "grad_norm": 0.22470812589114758, "learning_rate": 1.990281983724395e-06, "loss": -0.0, "num_tokens": 11156158.0, "reward": 0.8125, "reward_std": 0.30221718549728394, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 125 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.36129032258064514, "grad_norm": 0.07239011287954675, "learning_rate": 1.9901248690255043e-06, "loss": 0.0, "num_tokens": 11242573.0, "reward": 0.6328125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 126 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504917864602565e-09, "advantages/std": 0.5726882815361023, "advantages/var": 0.32797186780877396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.36415770609319, "grad_norm": 0.11009253074124153, "learning_rate": 1.989966500727026e-06, "loss": 0.0, "num_tokens": 11330169.0, "reward": 0.6875, "reward_std": 0.1643974632024765, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 127 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.36702508960573477, "grad_norm": 0.11712893123786251, "learning_rate": 1.989806879029471e-06, "loss": 0.0, "num_tokens": 11421447.0, "reward": 0.65625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 128 }, { "advantages/mean": -9.778887033462524e-09, "advantages/snr": 1.3226438138076389e-08, "advantages/std": 0.7393439412117004, "advantages/var": 0.5466294634064504, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "epoch": 0.36989247311827955, "grad_norm": 0.18004967601954425, "learning_rate": 1.9896460041349366e-06, "loss": -0.0, "num_tokens": 11528509.0, "reward": 0.5, "reward_std": 0.2858891487121582, "rewards/drgrpo_math_reward/mean": 0.5, "rewards/drgrpo_math_reward/std": 0.5019646286964417, "step": 129 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907149309207469e-10, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.3727598566308244, "grad_norm": 0.1020692604987898, "learning_rate": 1.989483876247107e-06, "loss": 0.0, "num_tokens": 11621139.0, "reward": 0.609375, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 130 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.3756272401433692, "grad_norm": 0.07865944799990532, "learning_rate": 1.9893204955712522e-06, "loss": 0.0, "num_tokens": 11694715.0, "reward": 0.8203125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 131 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.37849462365591396, "grad_norm": 0.06802770153495377, "learning_rate": 1.98915586231423e-06, "loss": 0.0, "num_tokens": 11770514.0, "reward": 0.8125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 132 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.3813620071684588, "grad_norm": 0.08913216962341508, "learning_rate": 1.9889899766844814e-06, "loss": 0.0, "num_tokens": 11859752.0, "reward": 0.609375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 133 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691872442631884e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.3842293906810036, "grad_norm": 0.07666960009104674, "learning_rate": 1.9888228388920358e-06, "loss": 0.0, "num_tokens": 11948859.0, "reward": 0.640625, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 134 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.3870967741935484, "grad_norm": 0.06749265175035522, "learning_rate": 1.9886544491485064e-06, "loss": 0.0, "num_tokens": 12030381.0, "reward": 0.7890625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 135 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.661277174949646, "advantages/var": 0.4372875021093847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.3899641577060932, "grad_norm": 0.09757012597819024, "learning_rate": 1.988484807667092e-06, "loss": 0.0, "num_tokens": 12129427.0, "reward": 0.7421875, "reward_std": 0.2120065838098526, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 136 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.2470284853558477e-08, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.392831541218638, "grad_norm": 0.12809089614582359, "learning_rate": 1.9883139146625762e-06, "loss": 0.0, "num_tokens": 12201685.0, "reward": 0.9296875, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 137 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675973355770111, "advantages/var": 0.21864726823871994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.3956989247311828, "grad_norm": 0.0828590398375662, "learning_rate": 1.988141770351326e-06, "loss": -0.0, "num_tokens": 12283527.0, "reward": 0.75, "reward_std": 0.11100947856903076, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 138 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344599133202926e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.3985663082437276, "grad_norm": 0.12077351927878063, "learning_rate": 1.987968374951296e-06, "loss": 0.0, "num_tokens": 12363347.0, "reward": 0.7265625, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 139 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899627360122966e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 0.4014336917562724, "grad_norm": 0.08131195331107871, "learning_rate": 1.9877937286820203e-06, "loss": -0.0, "num_tokens": 12451379.0, "reward": 0.734375, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 140 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599658819865184e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.4043010752688172, "grad_norm": 0.10033367759342848, "learning_rate": 1.9876178317646203e-06, "loss": -0.0, "num_tokens": 12522473.0, "reward": 0.8125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 141 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.4394010705092733e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.407168458781362, "grad_norm": 0.1899150573095275, "learning_rate": 1.9874406844217987e-06, "loss": 0.0, "num_tokens": 12620698.0, "reward": 0.6640625, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 142 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8994882297977766e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.4100358422939068, "grad_norm": 0.10722039505936129, "learning_rate": 1.9872622868778427e-06, "loss": 0.0, "num_tokens": 12707291.0, "reward": 0.890625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 143 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721560650546284e-09, "advantages/std": 0.5227927565574646, "advantages/var": 0.27331226630895245, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.4129032258064516, "grad_norm": 0.18915905416870124, "learning_rate": 1.987082639358622e-06, "loss": 0.0, "num_tokens": 12794653.0, "reward": 0.7578125, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 144 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516881500847056e-09, "advantages/std": 0.6185612082481384, "advantages/var": 0.3826179683493969, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.4157706093189964, "grad_norm": 0.10301029280811545, "learning_rate": 1.9869017420915886e-06, "loss": 0.0, "num_tokens": 12891348.0, "reward": 0.6875, "reward_std": 0.1751839816570282, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 145 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.0570528935649014e-08, "advantages/std": 0.5726862549781799, "advantages/var": 0.3279695466409329, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.41863799283154124, "grad_norm": 0.10928504117762153, "learning_rate": 1.9867195953057764e-06, "loss": -0.0, "num_tokens": 12968944.0, "reward": 0.6953125, "reward_std": 0.16439256072044373, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 146 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6262611176060706e-09, "advantages/std": 0.5726771354675293, "advantages/var": 0.3279591014872949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.421505376344086, "grad_norm": 0.14184961527585777, "learning_rate": 1.986536199231803e-06, "loss": 0.0, "num_tokens": 13064416.0, "reward": 0.6875, "reward_std": 0.1530819833278656, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 147 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629072505384383e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "epoch": 0.4243727598566308, "grad_norm": 0.07271219756156644, "learning_rate": 1.986351554101866e-06, "loss": 0.0, "num_tokens": 13156550.0, "reward": 0.6328125, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 148 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.991689037214316e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.42724014336917565, "grad_norm": 0.14713350490945606, "learning_rate": 1.986165660149745e-06, "loss": 0.0, "num_tokens": 13234866.0, "reward": 0.84375, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 149 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814038649715254e-09, "advantages/std": 0.5228025913238525, "advantages/var": 0.2733225494949352, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.43010752688172044, "grad_norm": 0.1210857791775736, "learning_rate": 1.985978517610801e-06, "loss": 0.0, "num_tokens": 13315749.0, "reward": 0.71875, "reward_std": 0.14913517236709595, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 150 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.4329749103942652, "grad_norm": 0.09906878327713253, "learning_rate": 1.9857901267219754e-06, "loss": -0.0, "num_tokens": 13407117.0, "reward": 0.5546875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.4989531338214874, "step": 151 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235004516445228e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.43584229390681006, "grad_norm": 0.06565110128546628, "learning_rate": 1.9856004877217905e-06, "loss": -0.0, "num_tokens": 13490088.0, "reward": 0.609375, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 152 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185716986656189, "advantages/var": 0.3826309463900692, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.43870967741935485, "grad_norm": 0.19932970995255095, "learning_rate": 1.9854096008503493e-06, "loss": -0.0, "num_tokens": 13581609.0, "reward": 0.578125, "reward_std": 0.18990948796272278, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.4957992732524872, "step": 153 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.344599133202926e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.44157706093189963, "grad_norm": 0.11456953029288426, "learning_rate": 1.9852174663493334e-06, "loss": -0.0, "num_tokens": 13658704.0, "reward": 0.6484375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 154 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.199247230235001e-09, "advantages/std": 0.40495598316192627, "advantages/var": 0.1639893482986423, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.4444444444444444, "grad_norm": 0.07376191063082546, "learning_rate": 1.9850240844620046e-06, "loss": -0.0, "num_tokens": 13735606.0, "reward": 0.7265625, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 155 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975300560494157e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.609375, "epoch": 0.44731182795698926, "grad_norm": 0.09368775004559605, "learning_rate": 1.9848294554332047e-06, "loss": -0.0, "num_tokens": 13825703.0, "reward": 0.6328125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 156 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.45017921146953405, "grad_norm": 0.10344565808718491, "learning_rate": 1.9846335795093543e-06, "loss": 0.0, "num_tokens": 13901866.0, "reward": 0.7890625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 157 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049627482891083, "advantages/var": 0.16399482750186767, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.45304659498207883, "grad_norm": 0.06445645006262131, "learning_rate": 1.9844364569384516e-06, "loss": 0.0, "num_tokens": 13989135.0, "reward": 0.78125, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 158 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.4559139784946237, "grad_norm": 0.061078732138918075, "learning_rate": 1.984238087970075e-06, "loss": -0.0, "num_tokens": 14079134.0, "reward": 0.6015625, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 159 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252482966806137e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.45878136200716846, "grad_norm": 0.10308714282902937, "learning_rate": 1.9840384728553785e-06, "loss": -0.0, "num_tokens": 14176262.0, "reward": 0.546875, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.546875, "rewards/drgrpo_math_reward/std": 0.4997538626194, "step": 160 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.03370380844539e-09, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.46164874551971324, "grad_norm": 0.14498186715974062, "learning_rate": 1.983837611847096e-06, "loss": 0.0, "num_tokens": 14270346.0, "reward": 0.734375, "reward_std": 0.1820138692855835, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 161 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.764028020123724e-09, "advantages/std": 0.6185677647590637, "advantages/var": 0.3826260795990244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.4645161290322581, "grad_norm": 0.19613298266036122, "learning_rate": 1.9836355051995393e-06, "loss": 0.0, "num_tokens": 14359158.0, "reward": 0.703125, "reward_std": 0.18648964166641235, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 162 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.991766726549734e-09, "advantages/std": 0.4675861597061157, "advantages/var": 0.21863681674871316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.46738351254480287, "grad_norm": 0.10450024700019477, "learning_rate": 1.9834321531685943e-06, "loss": -0.0, "num_tokens": 14444276.0, "reward": 0.7109375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 163 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979230209351863e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.47025089605734766, "grad_norm": 0.06584065416507832, "learning_rate": 1.9832275560117267e-06, "loss": 0.0, "num_tokens": 14533311.0, "reward": 0.6640625, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 164 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.7393245100975037, "advantages/var": 0.5466007312309138, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.671875, "epoch": 0.4731182795698925, "grad_norm": 0.21015750556078078, "learning_rate": 1.9830217139879765e-06, "loss": 0.0, "num_tokens": 14634245.0, "reward": 0.5859375, "reward_std": 0.25620073080062866, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 165 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.4948747045412387e-08, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.4759856630824373, "grad_norm": 0.09577320769164782, "learning_rate": 1.982814627357962e-06, "loss": 0.0, "num_tokens": 14715730.0, "reward": 0.828125, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 166 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.7014028429985046, "advantages/var": 0.49196594816638495, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.47885304659498207, "grad_norm": 0.15319625735144188, "learning_rate": 1.982606296383875e-06, "loss": 0.0, "num_tokens": 14803933.0, "reward": 0.6484375, "reward_std": 0.25460803508758545, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 167 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.4817204301075269, "grad_norm": 0.06886659174337954, "learning_rate": 1.982396721329485e-06, "loss": 0.0, "num_tokens": 14895037.0, "reward": 0.6875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 168 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967257841033182e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.4845878136200717, "grad_norm": 0.08315863659595965, "learning_rate": 1.9821859024601343e-06, "loss": 0.0, "num_tokens": 14977313.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 169 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.4874551971326165, "grad_norm": 0.07170340697141077, "learning_rate": 1.981973840042742e-06, "loss": -0.0, "num_tokens": 15053244.0, "reward": 0.734375, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 170 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.49032258064516127, "grad_norm": 0.0928601930313996, "learning_rate": 1.9817605343458004e-06, "loss": -0.0, "num_tokens": 15127424.0, "reward": 0.9375, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 171 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.504965933612274e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 0.4931899641577061, "grad_norm": 0.12236893897937705, "learning_rate": 1.9815459856393767e-06, "loss": -0.0, "num_tokens": 15219173.0, "reward": 0.578125, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.4957992732524872, "step": 172 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262736431211962e-09, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.4960573476702509, "grad_norm": 0.17657832013900868, "learning_rate": 1.9813301941951115e-06, "loss": -0.0, "num_tokens": 15299092.0, "reward": 0.734375, "reward_std": 0.1462520956993103, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 173 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.26953361251084e-09, "advantages/std": 0.618580162525177, "advantages/var": 0.3826414174696744, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.4989247311827957, "grad_norm": 0.09171230585823344, "learning_rate": 1.981113160286219e-06, "loss": 0.0, "num_tokens": 15390886.0, "reward": 0.7265625, "reward_std": 0.2012200653553009, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 174 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726932287216187, "advantages/var": 0.3279775342235922, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.5017921146953405, "grad_norm": 0.11084436188522191, "learning_rate": 1.980894884187486e-06, "loss": -0.0, "num_tokens": 15475430.0, "reward": 0.8671875, "reward_std": 0.172288179397583, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 175 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907268126346096e-10, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.5046594982078854, "grad_norm": 0.10981123292961002, "learning_rate": 1.9806753661752724e-06, "loss": 0.0, "num_tokens": 15572291.0, "reward": 0.6953125, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 176 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.1579757501173404e-08, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.5075268817204301, "grad_norm": 0.13723730154725833, "learning_rate": 1.980454606527511e-06, "loss": 0.0, "num_tokens": 15644451.0, "reward": 0.8203125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 177 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "epoch": 0.5103942652329749, "grad_norm": 0.08112611069058394, "learning_rate": 1.980232605523706e-06, "loss": -0.0, "num_tokens": 15735323.0, "reward": 0.765625, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 178 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962549257704446e-09, "advantages/std": 0.4676070809364319, "advantages/var": 0.21865638214189076, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.5132616487455197, "grad_norm": 0.09829344452240353, "learning_rate": 1.9800093634449336e-06, "loss": 0.0, "num_tokens": 15830114.0, "reward": 0.6640625, "reward_std": 0.12019839137792587, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 179 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227997303009033, "advantages/var": 0.27331955800269725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.5161290322580645, "grad_norm": 0.1380835985674579, "learning_rate": 1.9797848805738406e-06, "loss": 0.0, "num_tokens": 15917119.0, "reward": 0.71875, "reward_std": 0.14806944131851196, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 180 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227986574172974, "advantages/var": 0.27331843619732865, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.5189964157706093, "grad_norm": 0.08296385514780048, "learning_rate": 1.9795591571946452e-06, "loss": 0.0, "num_tokens": 16025022.0, "reward": 0.4296875, "reward_std": 0.14677615463733673, "rewards/drgrpo_math_reward/mean": 0.4296875, "rewards/drgrpo_math_reward/std": 0.4969765841960907, "step": 181 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 7.317923683175049e-09, "advantages/std": 0.5726968050003052, "advantages/var": 0.32798163045755757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.5218637992831541, "grad_norm": 0.1014964446395621, "learning_rate": 1.9793321935931374e-06, "loss": 0.0, "num_tokens": 16111335.0, "reward": 0.78125, "reward_std": 0.17464719712734222, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 182 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721679467630723e-09, "advantages/std": 0.5227904319763184, "advantages/var": 0.27330983576598555, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.515625, "epoch": 0.524731182795699, "grad_norm": 0.07338541137425778, "learning_rate": 1.979103990056675e-06, "loss": -0.0, "num_tokens": 16216919.0, "reward": 0.515625, "reward_std": 0.13781969249248505, "rewards/drgrpo_math_reward/mean": 0.515625, "rewards/drgrpo_math_reward/std": 0.5017194747924805, "step": 183 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.5275985663082438, "grad_norm": 0.09441859618951794, "learning_rate": 1.9788745468741884e-06, "loss": -0.0, "num_tokens": 16304021.0, "reward": 0.71875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 184 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983495876754114e-09, "advantages/std": 0.4675905704498291, "advantages/var": 0.2186409415735966, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.5304659498207885, "grad_norm": 0.07701416297176929, "learning_rate": 1.9786438643361754e-06, "loss": 0.0, "num_tokens": 16386297.0, "reward": 0.703125, "reward_std": 0.10205792635679245, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 185 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.5333333333333333, "grad_norm": 0.044088504975623034, "learning_rate": 1.978411942734704e-06, "loss": 0.0, "num_tokens": 16464227.0, "reward": 0.75, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 186 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.2675324905058625e-08, "advantages/std": 0.661277174949646, "advantages/var": 0.4372875021093847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.5362007168458781, "grad_norm": 0.11815011962199863, "learning_rate": 1.978178782363411e-06, "loss": 0.0, "num_tokens": 16553739.0, "reward": 0.6953125, "reward_std": 0.2120065838098526, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 187 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5056222341621311e-09, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.5390681003584229, "grad_norm": 0.10950900442633554, "learning_rate": 1.9779443835175006e-06, "loss": -0.0, "num_tokens": 16641094.0, "reward": 0.7109375, "reward_std": 0.17859894037246704, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 188 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8994882297977766e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.640625, "epoch": 0.5419354838709678, "grad_norm": 0.06256870745435893, "learning_rate": 1.977708746493746e-06, "loss": 0.0, "num_tokens": 16739866.0, "reward": 0.4375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.4375, "rewards/drgrpo_math_reward/std": 0.49802759289741516, "step": 189 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 1.1950065676939262e-08, "advantages/std": 0.4676070809364319, "advantages/var": 0.21865638214189076, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.5448028673835126, "grad_norm": 0.07119029462239967, "learning_rate": 1.977471871590488e-06, "loss": -0.0, "num_tokens": 16835943.0, "reward": 0.6484375, "reward_std": 0.12019838392734528, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 190 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.493777631223398e-08, "advantages/std": 0.4676010012626648, "advantages/var": 0.21865069638184664, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.5476702508960574, "grad_norm": 0.1244096994086529, "learning_rate": 1.977233759107635e-06, "loss": 0.0, "num_tokens": 16909432.0, "reward": 0.8125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 191 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.5505376344086022, "grad_norm": 0.044683376952152215, "learning_rate": 1.9769944093466608e-06, "loss": -0.0, "num_tokens": 16994670.0, "reward": 0.8125, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 192 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 0.5534050179211469, "grad_norm": 0.12143333829380999, "learning_rate": 1.9767538226106077e-06, "loss": -0.0, "num_tokens": 17083910.0, "reward": 0.5625, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49802759289741516, "step": 193 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5055932184229277e-09, "advantages/std": 0.6185751557350159, "advantages/var": 0.38263522329259914, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.5562724014336917, "grad_norm": 0.11438931886325104, "learning_rate": 1.9765119992040825e-06, "loss": 0.0, "num_tokens": 17176232.0, "reward": 0.8046875, "reward_std": 0.19567854702472687, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 194 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.781469264794068e-09, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.5591397849462365, "grad_norm": 0.1271093212760164, "learning_rate": 1.9762689394332583e-06, "loss": 0.0, "num_tokens": 17262820.0, "reward": 0.6328125, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 195 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.5620071684587814, "grad_norm": 0.08724342257879564, "learning_rate": 1.9760246436058746e-06, "loss": 0.0, "num_tokens": 17341241.0, "reward": 0.6875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 196 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629503101518235e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.5648745519713262, "grad_norm": 0.10609677844903032, "learning_rate": 1.975779112031234e-06, "loss": 0.0, "num_tokens": 17424364.0, "reward": 0.8828125, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 197 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628117918993365e-09, "advantages/std": 0.5228019952774048, "advantages/var": 0.2733219262660356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.567741935483871, "grad_norm": 0.12751734334843087, "learning_rate": 1.9755323450202054e-06, "loss": -0.0, "num_tokens": 17509905.0, "reward": 0.6953125, "reward_std": 0.14807432889938354, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 198 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.5706093189964158, "grad_norm": 0.08874879314195035, "learning_rate": 1.9752843428852203e-06, "loss": 0.0, "num_tokens": 17591545.0, "reward": 0.8828125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 199 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.5734767025089605, "grad_norm": 0.07702245326996278, "learning_rate": 1.9750351059402755e-06, "loss": 0.0, "num_tokens": 17686347.0, "reward": 0.640625, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 200 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6261990006727604e-09, "advantages/std": 0.5726990103721619, "advantages/var": 0.32798415648125356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.5763440860215053, "grad_norm": 0.18258259153790793, "learning_rate": 1.9747846345009303e-06, "loss": 0.0, "num_tokens": 17766586.0, "reward": 0.8203125, "reward_std": 0.17806214094161987, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 201 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.011145235853968e-09, "advantages/std": 0.618583619594574, "advantages/var": 0.3826456944307246, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.5792114695340502, "grad_norm": 0.11440188062213137, "learning_rate": 1.9745329288843074e-06, "loss": 0.0, "num_tokens": 17858219.0, "reward": 0.640625, "reward_std": 0.2069891095161438, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 202 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 6.7752778563340424e-09, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.582078853046595, "grad_norm": 0.1044706273434688, "learning_rate": 1.974279989409092e-06, "loss": 0.0, "num_tokens": 17945123.0, "reward": 0.734375, "reward_std": 0.1820138692855835, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 203 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.5849462365591398, "grad_norm": 0.08307099624544009, "learning_rate": 1.9740258163955306e-06, "loss": 0.0, "num_tokens": 18019296.0, "reward": 0.8984375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 204 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962814881146116e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.5878136200716846, "grad_norm": 0.10386880549735236, "learning_rate": 1.9737704101654332e-06, "loss": 0.0, "num_tokens": 18100689.0, "reward": 0.6484375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 205 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.5906810035842294, "grad_norm": 0.10204646235825936, "learning_rate": 1.9735137710421694e-06, "loss": -0.0, "num_tokens": 18185993.0, "reward": 0.640625, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 206 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.5935483870967742, "grad_norm": 0.0425580612992601, "learning_rate": 1.973255899350672e-06, "loss": 0.0, "num_tokens": 18265426.0, "reward": 0.5390625, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.5390625, "rewards/drgrpo_math_reward/std": 0.5004304051399231, "step": 207 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.344384639658041e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.596415770609319, "grad_norm": 0.12298704045791495, "learning_rate": 1.9729967954174317e-06, "loss": 0.0, "num_tokens": 18354467.0, "reward": 0.71875, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 208 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.25248601345888e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.5992831541218638, "grad_norm": 0.14043999076034508, "learning_rate": 1.972736459570501e-06, "loss": -0.0, "num_tokens": 18442227.0, "reward": 0.7578125, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 209 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.6021505376344086, "grad_norm": 0.08150097473653, "learning_rate": 1.972474892139492e-06, "loss": 0.0, "num_tokens": 18530843.0, "reward": 0.7109375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 210 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.527905160729771e-10, "advantages/std": 0.618580162525177, "advantages/var": 0.3826414174696744, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.6050179211469534, "grad_norm": 0.1087801091982331, "learning_rate": 1.972212093455576e-06, "loss": 0.0, "num_tokens": 18616572.0, "reward": 0.8359375, "reward_std": 0.2012200653553009, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 211 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.6078853046594982, "grad_norm": 0.04257051308921546, "learning_rate": 1.9719480638514825e-06, "loss": -0.0, "num_tokens": 18691487.0, "reward": 0.8203125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 212 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599795920412534e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.610752688172043, "grad_norm": 0.08885381295106154, "learning_rate": 1.9716828036615002e-06, "loss": -0.0, "num_tokens": 18770764.0, "reward": 0.734375, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 213 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 8.449892780067669e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.6136200716845878, "grad_norm": 0.07110255927977205, "learning_rate": 1.9714163132214763e-06, "loss": -0.0, "num_tokens": 18848544.0, "reward": 0.8203125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 214 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.9792225930357895e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.6164874551971327, "grad_norm": 0.1054824330345328, "learning_rate": 1.9711485928688146e-06, "loss": 0.0, "num_tokens": 18926159.0, "reward": 0.8125, "reward_std": 0.11678344756364822, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 215 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.1579601710407989e-08, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.6193548387096774, "grad_norm": 0.09028662906122077, "learning_rate": 1.9708796429424763e-06, "loss": 0.0, "num_tokens": 19020277.0, "reward": 0.625, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 216 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878573645701288e-09, "advantages/std": 0.5727017521858215, "advantages/var": 0.32798729695671014, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.6222222222222222, "grad_norm": 0.16890927041982728, "learning_rate": 1.9706094637829794e-06, "loss": 0.0, "num_tokens": 19096844.0, "reward": 0.7890625, "reward_std": 0.18253791332244873, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 217 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.3180584939108565e-09, "advantages/std": 0.5726862549781799, "advantages/var": 0.3279695466409329, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.625089605734767, "grad_norm": 0.09698327360753146, "learning_rate": 1.9703380557323994e-06, "loss": 0.0, "num_tokens": 19184975.0, "reward": 0.7109375, "reward_std": 0.16439256072044373, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 218 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.311215035117412e-09, "advantages/std": 0.7014007568359375, "advantages/var": 0.4919630216900259, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.6279569892473118, "grad_norm": 0.14413514727585514, "learning_rate": 1.970065419134366e-06, "loss": 0.0, "num_tokens": 19277171.0, "reward": 0.7421875, "reward_std": 0.25354230403900146, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 219 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.6308243727598566, "grad_norm": 0.11398432656810822, "learning_rate": 1.969791554334065e-06, "loss": 0.0, "num_tokens": 19355220.0, "reward": 0.8125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 220 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.2470341725316569e-08, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.6336917562724015, "grad_norm": 0.06968706164339275, "learning_rate": 1.9695164616782378e-06, "loss": 0.0, "num_tokens": 19442603.0, "reward": 0.9375, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 221 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.408356612639279e-09, "advantages/std": 0.6612831950187683, "advantages/var": 0.43729546401423036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.6365591397849463, "grad_norm": 0.12417487379900514, "learning_rate": 1.969240141515179e-06, "loss": 0.0, "num_tokens": 19536443.0, "reward": 0.7109375, "reward_std": 0.21990221738815308, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 222 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629027821475993e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.6394265232974911, "grad_norm": 0.11190128164642635, "learning_rate": 1.9689625941947394e-06, "loss": -0.0, "num_tokens": 19613563.0, "reward": 0.921875, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 223 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 8.962814881146116e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.6422939068100358, "grad_norm": 0.10184327264367528, "learning_rate": 1.9686838200683217e-06, "loss": 0.0, "num_tokens": 19682932.0, "reward": 0.8515625, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 224 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.131062703721143e-10, "advantages/std": 0.5726942420005798, "advantages/var": 0.3279786948206187, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.6451612903225806, "grad_norm": 0.13523562258217606, "learning_rate": 1.9684038194888825e-06, "loss": -0.0, "num_tokens": 19774164.0, "reward": 0.75, "reward_std": 0.17358146607875824, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 225 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.6480286738351254, "grad_norm": 0.08049877908916266, "learning_rate": 1.9681225928109316e-06, "loss": 0.0, "num_tokens": 19854147.0, "reward": 0.8984375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 226 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958350617856365e-10, "advantages/std": 0.46760883927345276, "advantages/var": 0.21865802656666578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.6508960573476702, "grad_norm": 0.10117609949737459, "learning_rate": 1.9678401403905304e-06, "loss": -0.0, "num_tokens": 19941487.0, "reward": 0.7578125, "reward_std": 0.11914245784282684, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 227 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065478038084407e-09, "advantages/std": 0.5727017521858215, "advantages/var": 0.32798729695671014, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.6537634408602151, "grad_norm": 0.13237339755186903, "learning_rate": 1.967556462585293e-06, "loss": 0.0, "num_tokens": 20033133.0, "reward": 0.6171875, "reward_std": 0.18253791332244873, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 228 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.6566308243727599, "grad_norm": 0.08008024137443356, "learning_rate": 1.967271559754384e-06, "loss": 0.0, "num_tokens": 20108285.0, "reward": 0.8046875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 229 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.439373903985093e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.6594982078853047, "grad_norm": 0.09014607484840405, "learning_rate": 1.9669854322585205e-06, "loss": 0.0, "num_tokens": 20192884.0, "reward": 0.796875, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 230 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131207417015344e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.6623655913978495, "grad_norm": 0.12403951610663785, "learning_rate": 1.9666980804599685e-06, "loss": 0.0, "num_tokens": 20269154.0, "reward": 0.84375, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 231 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.6652329749103942, "grad_norm": 0.06451461003988601, "learning_rate": 1.966409504722545e-06, "loss": 0.0, "num_tokens": 20346655.0, "reward": 0.6796875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 232 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899267866969296e-09, "advantages/std": 0.4049658179283142, "advantages/var": 0.16399731369034853, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.668100358422939, "grad_norm": 0.07448062664269792, "learning_rate": 1.9661197054116164e-06, "loss": 0.0, "num_tokens": 20430353.0, "reward": 0.7890625, "reward_std": 0.094686359167099, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 233 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.02242626209624e-09, "advantages/std": 0.6185696721076965, "advantages/var": 0.3826284392514232, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.6709677419354839, "grad_norm": 0.139481185743962, "learning_rate": 1.9658286828940987e-06, "loss": -0.0, "num_tokens": 20522660.0, "reward": 0.6953125, "reward_std": 0.18649455904960632, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 234 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.6738351254480287, "grad_norm": 0.089851647052127, "learning_rate": 1.965536437538456e-06, "loss": 0.0, "num_tokens": 20614712.0, "reward": 0.7265625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 235 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.6767025089605735, "grad_norm": 0.02091972713635081, "learning_rate": 1.9652429697147003e-06, "loss": 0.0, "num_tokens": 20696075.0, "reward": 0.6640625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 236 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966768334962982e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.6795698924731183, "grad_norm": 0.0821111362604408, "learning_rate": 1.964948279794393e-06, "loss": 0.0, "num_tokens": 20775211.0, "reward": 0.7734375, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 237 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.6824372759856631, "grad_norm": 0.0, "learning_rate": 1.9646523681506414e-06, "loss": 0.0, "num_tokens": 20844029.0, "reward": 0.8125, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 238 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.225206177997688e-09, "advantages/std": 0.6612618565559387, "advantages/var": 0.4372672429358069, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.6853046594982078, "grad_norm": 0.11245554500128893, "learning_rate": 1.9643552351580997e-06, "loss": 0.0, "num_tokens": 20926989.0, "reward": 0.765625, "reward_std": 0.19044628739356995, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 239 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.90714930920747e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.6881720430107527, "grad_norm": 0.09191890580990122, "learning_rate": 1.964056881192969e-06, "loss": -0.0, "num_tokens": 21011593.0, "reward": 0.765625, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 240 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3443115207936544e-09, "advantages/std": 0.5227928161621094, "advantages/var": 0.2733123286307091, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.6910394265232975, "grad_norm": 0.12594432298115876, "learning_rate": 1.963757306632996e-06, "loss": 0.0, "num_tokens": 21098836.0, "reward": 0.7109375, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 241 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.528014692854944e-10, "advantages/std": 0.6185711622238159, "advantages/var": 0.3826302827349224, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.6939068100358423, "grad_norm": 0.15637871712059767, "learning_rate": 1.963456511857472e-06, "loss": 0.0, "num_tokens": 21178936.0, "reward": 0.7265625, "reward_std": 0.18884867429733276, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 242 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.6967741935483871, "grad_norm": 0.060148920256316504, "learning_rate": 1.9631544972472355e-06, "loss": -0.0, "num_tokens": 21255305.0, "reward": 0.7734375, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 243 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749701738269761e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.6996415770609319, "grad_norm": 0.08951489657922204, "learning_rate": 1.962851263184667e-06, "loss": -0.0, "num_tokens": 21354786.0, "reward": 0.4609375, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.4609375, "rewards/drgrpo_math_reward/std": 0.5004304051399231, "step": 244 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0688659600644362e-08, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.7025089605734767, "grad_norm": 0.09867558191702931, "learning_rate": 1.9625468100536918e-06, "loss": -0.0, "num_tokens": 21443982.0, "reward": 0.6640625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 245 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.7053763440860215, "grad_norm": 0.07725741015662844, "learning_rate": 1.9622411382397793e-06, "loss": 0.0, "num_tokens": 21529714.0, "reward": 0.734375, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 246 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721804379038284e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.7082437275985664, "grad_norm": 0.1149381843048165, "learning_rate": 1.9619342481299407e-06, "loss": -0.0, "num_tokens": 21626721.0, "reward": 0.6015625, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 247 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.563018557708836e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.7111111111111111, "grad_norm": 0.11302622859074207, "learning_rate": 1.9616261401127316e-06, "loss": 0.0, "num_tokens": 21713279.0, "reward": 0.703125, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 248 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.057046292582203e-08, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 0.7139784946236559, "grad_norm": 0.12328874251473232, "learning_rate": 1.9613168145782465e-06, "loss": 0.0, "num_tokens": 21797331.0, "reward": 0.59375, "reward_std": 0.16675156354904175, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.4930621087551117, "step": 249 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.041958882220529e-09, "advantages/std": 0.6612666845321655, "advantages/var": 0.4372736280721625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.7168458781362007, "grad_norm": 0.11302415709130216, "learning_rate": 1.9610062719181248e-06, "loss": 0.0, "num_tokens": 21883873.0, "reward": 0.734375, "reward_std": 0.19939783215522766, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 250 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.7197132616487455, "grad_norm": 0.08517678527092525, "learning_rate": 1.9606945125255447e-06, "loss": 0.0, "num_tokens": 21962730.0, "reward": 0.75, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 251 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383618110974619e-08, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.7225806451612903, "grad_norm": 0.0913818639821884, "learning_rate": 1.9603815367952253e-06, "loss": -0.0, "num_tokens": 22054023.0, "reward": 0.5859375, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 252 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907505770133387e-10, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.7254480286738352, "grad_norm": 0.10096835309765015, "learning_rate": 1.9600673451234268e-06, "loss": 0.0, "num_tokens": 22125045.0, "reward": 0.8828125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 253 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.701427161693573, "advantages/var": 0.4920000631615018, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "epoch": 0.72831541218638, "grad_norm": 0.13821949511592338, "learning_rate": 1.9597519379079476e-06, "loss": -0.0, "num_tokens": 22236387.0, "reward": 0.5234375, "reward_std": 0.29536473751068115, "rewards/drgrpo_math_reward/mean": 0.5234375, "rewards/drgrpo_math_reward/std": 0.5014128684997559, "step": 254 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.966686079472842e-09, "advantages/std": 0.4676085114479065, "advantages/var": 0.2186577199785269, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.7311827956989247, "grad_norm": 0.08267618108141793, "learning_rate": 1.959435315548125e-06, "loss": -0.0, "num_tokens": 22315025.0, "reward": 0.7109375, "reward_std": 0.12232004851102829, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 255 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.7340501792114695, "grad_norm": 0.10328968332091389, "learning_rate": 1.959117478444836e-06, "loss": 0.0, "num_tokens": 22394444.0, "reward": 0.84375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 256 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.317958718255826e-09, "advantages/std": 0.5726940631866455, "advantages/var": 0.3279784900092295, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.7369175627240143, "grad_norm": 0.12066286064952439, "learning_rate": 1.9587984270004948e-06, "loss": 0.0, "num_tokens": 22477849.0, "reward": 0.671875, "reward_std": 0.17017142474651337, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 257 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.7397849462365591, "grad_norm": 0.13435211185718615, "learning_rate": 1.9584781616190534e-06, "loss": 0.0, "num_tokens": 22554506.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 258 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.7426523297491039, "grad_norm": 0.06223946926258825, "learning_rate": 1.9581566827060006e-06, "loss": 0.0, "num_tokens": 22621556.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 259 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958459784009791e-09, "advantages/std": 0.4676037132740021, "advantages/var": 0.21865323266763514, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.7455197132616488, "grad_norm": 0.09677817546646107, "learning_rate": 1.9578339906683615e-06, "loss": 0.0, "num_tokens": 22707125.0, "reward": 0.6484375, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 260 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016434378286722e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.7483870967741936, "grad_norm": 0.10330998272932007, "learning_rate": 1.9575100859146973e-06, "loss": -0.0, "num_tokens": 22787207.0, "reward": 0.765625, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 261 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.4536934733273465e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.7512544802867384, "grad_norm": 0.11445295703033564, "learning_rate": 1.9571849688551045e-06, "loss": 0.0, "num_tokens": 22879810.0, "reward": 0.6875, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 262 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.7541218637992831, "grad_norm": 0.1431856830147952, "learning_rate": 1.956858639901215e-06, "loss": -0.0, "num_tokens": 22958989.0, "reward": 0.8046875, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 263 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.0026272842061405e-09, "advantages/std": 0.7754230499267578, "advantages/var": 0.6012809063577151, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.7569892473118279, "grad_norm": 0.17368791123183652, "learning_rate": 1.9565310994661943e-06, "loss": -0.0, "num_tokens": 23057071.0, "reward": 0.6640625, "reward_std": 0.30115634202957153, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 264 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.975176026781512e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.7598566308243727, "grad_norm": 0.07708908665544316, "learning_rate": 1.9562023479647423e-06, "loss": -0.0, "num_tokens": 23130250.0, "reward": 0.84375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 265 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.906938086565493e-10, "advantages/std": 0.5228073596954346, "advantages/var": 0.2733275353517115, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.7627240143369176, "grad_norm": 0.12603516044479746, "learning_rate": 1.955872385813092e-06, "loss": -0.0, "num_tokens": 23221313.0, "reward": 0.75, "reward_std": 0.15596505999565125, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 266 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.7655913978494624, "grad_norm": 0.057732338064037186, "learning_rate": 1.95554121342901e-06, "loss": 0.0, "num_tokens": 23303763.0, "reward": 0.71875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 267 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0689055673126508e-08, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.7684587813620072, "grad_norm": 0.09102252601234391, "learning_rate": 1.955208831231794e-06, "loss": 0.0, "num_tokens": 23388152.0, "reward": 0.828125, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 268 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065623173308489e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.771326164874552, "grad_norm": 0.10980681592679174, "learning_rate": 1.9548752396422735e-06, "loss": 0.0, "num_tokens": 23475335.0, "reward": 0.84375, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 269 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185775995254517, "advantages/var": 0.38263824663467005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.7741935483870968, "grad_norm": 0.12735926909571454, "learning_rate": 1.9545404390828105e-06, "loss": -0.0, "num_tokens": 23572561.0, "reward": 0.6171875, "reward_std": 0.19674429297447205, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 270 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0688419534073913e-08, "advantages/std": 0.5228027701377869, "advantages/var": 0.2733227364637436, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 0.7770609318996415, "grad_norm": 0.09015350223423653, "learning_rate": 1.9542044299772958e-06, "loss": 0.0, "num_tokens": 23667633.0, "reward": 0.75, "reward_std": 0.15254521369934082, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 271 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.7799283154121864, "grad_norm": 0.070569427294079, "learning_rate": 1.9538672127511523e-06, "loss": 0.0, "num_tokens": 23743563.0, "reward": 0.796875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 272 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.974933067692274e-09, "advantages/std": 0.46761488914489746, "advantages/var": 0.21866368454999474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.7827956989247312, "grad_norm": 0.12482820251144082, "learning_rate": 1.9535287878313314e-06, "loss": -0.0, "num_tokens": 23811169.0, "reward": 0.8125, "reward_std": 0.12703317403793335, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 273 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.757495615940373e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.785663082437276, "grad_norm": 0.11090556503992859, "learning_rate": 1.953189155646313e-06, "loss": -0.0, "num_tokens": 23895942.0, "reward": 0.84375, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 274 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 9.786451163004381e-09, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.7885304659498208, "grad_norm": 0.13384615172927988, "learning_rate": 1.952848316626108e-06, "loss": -0.0, "num_tokens": 23994424.0, "reward": 0.578125, "reward_std": 0.18543371558189392, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.4957992732524872, "step": 275 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 4.723789219088827e-09, "advantages/std": 0.7393343448638916, "advantages/var": 0.5466152734953198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.7913978494623656, "grad_norm": 0.16870213729650185, "learning_rate": 1.9525062712022515e-06, "loss": 0.0, "num_tokens": 24090030.0, "reward": 0.6171875, "reward_std": 0.2698703408241272, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 276 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878797063112294e-09, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 0.7942652329749104, "grad_norm": 0.11898481530527014, "learning_rate": 1.952163019807809e-06, "loss": 0.0, "num_tokens": 24164254.0, "reward": 0.75, "reward_std": 0.15072788298130035, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 277 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 7.746009544294893e-09, "advantages/std": 0.6612790822982788, "advantages/var": 0.4372900246852538, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.7971326164874551, "grad_norm": 0.13360700941196924, "learning_rate": 1.951818562877372e-06, "loss": 0.0, "num_tokens": 24261394.0, "reward": 0.671875, "reward_std": 0.21542152762413025, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 278 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.2596376216935218e-09, "advantages/std": 0.7393575310707092, "advantages/var": 0.5466495587509748, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.8, "grad_norm": 0.1580545242588768, "learning_rate": 1.951472900847058e-06, "loss": -0.0, "num_tokens": 24349607.0, "reward": 0.6328125, "reward_std": 0.30744943022727966, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 279 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.4635969227931852e-08, "advantages/std": 0.5726920366287231, "advantages/var": 0.32797616881795477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.8028673835125448, "grad_norm": 0.10266306757635575, "learning_rate": 1.9511260341545107e-06, "loss": 0.0, "num_tokens": 24433195.0, "reward": 0.7109375, "reward_std": 0.1701665222644806, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 280 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0111426245614905e-09, "advantages/std": 0.618584156036377, "advantages/var": 0.38264635809923675, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.8057347670250896, "grad_norm": 0.13785243356693017, "learning_rate": 1.9507779632388995e-06, "loss": 0.0, "num_tokens": 24512800.0, "reward": 0.7578125, "reward_std": 0.208049938082695, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 281 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562971027883829e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.8086021505376344, "grad_norm": 0.10594585005660433, "learning_rate": 1.950428688540917e-06, "loss": -0.0, "num_tokens": 24604862.0, "reward": 0.6796875, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 282 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.8114695340501792, "grad_norm": 0.05473787045146346, "learning_rate": 1.9500782105027807e-06, "loss": -0.0, "num_tokens": 24695761.0, "reward": 0.7578125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 283 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.131246346616979e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.814336917562724, "grad_norm": 0.18183527260060542, "learning_rate": 1.9497265295682326e-06, "loss": -0.0, "num_tokens": 24780624.0, "reward": 0.828125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 284 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504997077293582e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.8172043010752689, "grad_norm": 0.11798162586925885, "learning_rate": 1.9493736461825363e-06, "loss": 0.0, "num_tokens": 24864352.0, "reward": 0.828125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 285 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.609375, "epoch": 0.8200716845878137, "grad_norm": 0.03640262071234597, "learning_rate": 1.9490195607924782e-06, "loss": 0.0, "num_tokens": 24959916.0, "reward": 0.578125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.4957992732524872, "step": 286 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.8229390681003584, "grad_norm": 0.06500180531661805, "learning_rate": 1.948664273846367e-06, "loss": 0.0, "num_tokens": 25045434.0, "reward": 0.6328125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 287 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.0224889366485245e-09, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.8258064516129032, "grad_norm": 0.12945246604514526, "learning_rate": 1.9483077857940326e-06, "loss": 0.0, "num_tokens": 25134360.0, "reward": 0.6328125, "reward_std": 0.17859894037246704, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 288 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958683201273463e-10, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.828673835125448, "grad_norm": 0.07853132739771744, "learning_rate": 1.9479500970868246e-06, "loss": 0.0, "num_tokens": 25214310.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 289 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.8315412186379928, "grad_norm": 0.09141425024311402, "learning_rate": 1.9475912081776144e-06, "loss": -0.0, "num_tokens": 25302858.0, "reward": 0.5234375, "reward_std": 0.08443662524223328, "rewards/drgrpo_math_reward/mean": 0.5234375, "rewards/drgrpo_math_reward/std": 0.5014128684997559, "step": 290 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499147049662961e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.8344086021505376, "grad_norm": 0.09590480521856211, "learning_rate": 1.9472311195207915e-06, "loss": -0.0, "num_tokens": 25386803.0, "reward": 0.796875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 291 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9916330589501627e-09, "advantages/std": 0.46761754155158997, "advantages/var": 0.21866616516675297, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.8372759856630825, "grad_norm": 0.11322544781915576, "learning_rate": 1.9468698315722655e-06, "loss": -0.0, "num_tokens": 25468775.0, "reward": 0.7421875, "reward_std": 0.130448117852211, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 292 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.453692965541534e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.8401433691756273, "grad_norm": 0.08298753538962973, "learning_rate": 1.946507344789464e-06, "loss": -0.0, "num_tokens": 25552470.0, "reward": 0.71875, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 293 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5056146899623504e-09, "advantages/std": 0.6185663342475891, "advantages/var": 0.38262430986450013, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.843010752688172, "grad_norm": 0.12742568170963892, "learning_rate": 1.9461436596313317e-06, "loss": -0.0, "num_tokens": 25640616.0, "reward": 0.65625, "reward_std": 0.1841355264186859, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 294 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721649001610904e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.8458781362007168, "grad_norm": 0.13611944421932107, "learning_rate": 1.9457787765583325e-06, "loss": 0.0, "num_tokens": 25720686.0, "reward": 0.7734375, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 295 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125686138545943e-09, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.8487455197132616, "grad_norm": 0.0888802118140364, "learning_rate": 1.945412696032445e-06, "loss": 0.0, "num_tokens": 25809142.0, "reward": 0.7421875, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 296 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.033674212111923e-09, "advantages/std": 0.6185672879219055, "advantages/var": 0.38262548968706156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.8516129032258064, "grad_norm": 0.11517617807560514, "learning_rate": 1.9450454185171647e-06, "loss": 0.0, "num_tokens": 25886208.0, "reward": 0.7890625, "reward_std": 0.18542881309986115, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 297 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.8544802867383513, "grad_norm": 0.09118794291826457, "learning_rate": 1.944676944477503e-06, "loss": 0.0, "num_tokens": 25962561.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 298 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814432667740602e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.8573476702508961, "grad_norm": 0.11843588600056001, "learning_rate": 1.944307274379985e-06, "loss": 0.0, "num_tokens": 26033077.0, "reward": 0.8515625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 299 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.9792508369915716e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 0.8602150537634409, "grad_norm": 0.06641757839953512, "learning_rate": 1.943936408692652e-06, "loss": -0.0, "num_tokens": 26124611.0, "reward": 0.7578125, "reward_std": 0.11336850374937057, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 300 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917699002625455e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.8630824372759857, "grad_norm": 0.088638599805281, "learning_rate": 1.9435643478850574e-06, "loss": -0.0, "num_tokens": 26213659.0, "reward": 0.75, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 301 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.4536934733273465e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.8659498207885304, "grad_norm": 0.11418468041650211, "learning_rate": 1.9431910924282677e-06, "loss": -0.0, "num_tokens": 26303162.0, "reward": 0.609375, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 302 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983349132682101e-09, "advantages/std": 0.4676077961921692, "advantages/var": 0.21865705105969724, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.8688172043010752, "grad_norm": 0.08943390980926356, "learning_rate": 1.942816642794864e-06, "loss": 0.0, "num_tokens": 26395398.0, "reward": 0.71875, "reward_std": 0.12125921249389648, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 303 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.95862671130252e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.8716845878136201, "grad_norm": 0.10227841262538512, "learning_rate": 1.942440999458937e-06, "loss": 0.0, "num_tokens": 26482466.0, "reward": 0.703125, "reward_std": 0.10888781398534775, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 304 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.8745519713261649, "grad_norm": 0.06312615865790253, "learning_rate": 1.9420641628960895e-06, "loss": 0.0, "num_tokens": 26545949.0, "reward": 0.921875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 305 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9875335558214736e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.8774193548387097, "grad_norm": 0.14769399231288133, "learning_rate": 1.9416861335834354e-06, "loss": 0.0, "num_tokens": 26635345.0, "reward": 0.703125, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 306 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.8167388677908054e-09, "advantages/std": 0.661277174949646, "advantages/var": 0.4372875021093847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.8802867383512545, "grad_norm": 0.1719249628407586, "learning_rate": 1.9413069119995994e-06, "loss": 0.0, "num_tokens": 26724453.0, "reward": 0.6953125, "reward_std": 0.2120065838098526, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 307 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.8831541218637993, "grad_norm": 0.051521206027291225, "learning_rate": 1.9409264986247136e-06, "loss": -0.0, "num_tokens": 26810148.0, "reward": 0.625, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 308 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.130935766475913e-10, "advantages/std": 0.5727031826972961, "advantages/var": 0.32798893547161256, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 0.886021505376344, "grad_norm": 0.1586860930519003, "learning_rate": 1.9405448939404213e-06, "loss": 0.0, "num_tokens": 26887753.0, "reward": 0.7421875, "reward_std": 0.1814819872379303, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 309 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5056415302478586e-09, "advantages/std": 0.6185553073883057, "advantages/var": 0.3826106682982413, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.8888888888888888, "grad_norm": 0.11676973725591522, "learning_rate": 1.9401620984298726e-06, "loss": -0.0, "num_tokens": 26975280.0, "reward": 0.8203125, "reward_std": 0.16834920644760132, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 310 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.3361080419982039e-08, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.8917562724014337, "grad_norm": 0.10791082633442746, "learning_rate": 1.9397781125777263e-06, "loss": 0.0, "num_tokens": 27063327.0, "reward": 0.75, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 311 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628865335481204e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.8946236559139785, "grad_norm": 0.11106150865967176, "learning_rate": 1.9393929368701474e-06, "loss": 0.0, "num_tokens": 27142933.0, "reward": 0.7109375, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 312 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383310073066278e-08, "advantages/std": 0.5727031826972961, "advantages/var": 0.32798893547161256, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.8974910394265233, "grad_norm": 0.1308299915232991, "learning_rate": 1.939006571794808e-06, "loss": 0.0, "num_tokens": 27229159.0, "reward": 0.6015625, "reward_std": 0.1814819872379303, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 313 }, { "advantages/mean": -1.0710209608078003e-08, "advantages/snr": 1.870084863867376e-08, "advantages/std": 0.5727124810218811, "advantages/var": 0.3279995859182385, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.9003584229390681, "grad_norm": 0.12675411347923377, "learning_rate": 1.9386190178408863e-06, "loss": 0.0, "num_tokens": 27321247.0, "reward": 0.71875, "reward_std": 0.1962025910615921, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 314 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.269560016029535e-09, "advantages/std": 0.6185770630836487, "advantages/var": 0.3826375829731923, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.9032258064516129, "grad_norm": 0.11983649469404985, "learning_rate": 1.9382302754990644e-06, "loss": 0.0, "num_tokens": 27410537.0, "reward": 0.625, "reward_std": 0.19568344950675964, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 315 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878505609008981e-09, "advantages/std": 0.5727097392082214, "advantages/var": 0.327996445383949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.671875, "epoch": 0.9060931899641577, "grad_norm": 0.10341377968316127, "learning_rate": 1.9378403452615308e-06, "loss": -0.0, "num_tokens": 27506250.0, "reward": 0.640625, "reward_std": 0.19172681868076324, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 316 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344431558649841e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.9089605734767026, "grad_norm": 0.09507098139210479, "learning_rate": 1.937449227621977e-06, "loss": -0.0, "num_tokens": 27581956.0, "reward": 0.75, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 317 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 1.1950152502444473e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 0.9118279569892473, "grad_norm": 0.07694112355680065, "learning_rate": 1.937056923075598e-06, "loss": -0.0, "num_tokens": 27675568.0, "reward": 0.7421875, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 318 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.25248601345888e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.9146953405017921, "grad_norm": 0.1045675617841734, "learning_rate": 1.936663432119091e-06, "loss": 0.0, "num_tokens": 27755049.0, "reward": 0.8046875, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 319 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.9175627240143369, "grad_norm": 0.08094397471372518, "learning_rate": 1.936268755250657e-06, "loss": 0.0, "num_tokens": 27833337.0, "reward": 0.8125, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 320 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.068867300569461e-08, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.9204301075268817, "grad_norm": 0.08290054487635672, "learning_rate": 1.935872892969996e-06, "loss": 0.0, "num_tokens": 27913335.0, "reward": 0.71875, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 321 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6262414834030685e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.9232974910394265, "grad_norm": 0.14424633415032476, "learning_rate": 1.9354758457783118e-06, "loss": -0.0, "num_tokens": 28000444.0, "reward": 0.734375, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 322 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.0570411218700057e-08, "advantages/std": 0.5726926326751709, "advantages/var": 0.3279768515204182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.9261648745519713, "grad_norm": 0.10262016914447541, "learning_rate": 1.9350776141783053e-06, "loss": -0.0, "num_tokens": 28090617.0, "reward": 0.65625, "reward_std": 0.1712273508310318, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 323 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.9290322580645162, "grad_norm": 0.06264814038661047, "learning_rate": 1.9346781986741796e-06, "loss": -0.0, "num_tokens": 28167398.0, "reward": 0.8203125, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 324 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 7.528182986834277e-09, "advantages/std": 0.618557333946228, "advantages/var": 0.38261317537866546, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.931899641577061, "grad_norm": 0.09011173223308504, "learning_rate": 1.9342775997716356e-06, "loss": -0.0, "num_tokens": 28261498.0, "reward": 0.625, "reward_std": 0.17176413536071777, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 325 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.9347670250896057, "grad_norm": 0.07463006151846577, "learning_rate": 1.933875817977872e-06, "loss": 0.0, "num_tokens": 28333078.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 326 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983533706996105e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.9376344086021505, "grad_norm": 0.0995946719653618, "learning_rate": 1.9334728538015857e-06, "loss": 0.0, "num_tokens": 28408055.0, "reward": 0.8828125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 327 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 0.9405017921146953, "grad_norm": 0.04232695230937809, "learning_rate": 1.933068707752972e-06, "loss": 0.0, "num_tokens": 28484536.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 328 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.01628083497369e-09, "advantages/std": 0.5228049755096436, "advantages/var": 0.273325042417639, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 0.9433691756272401, "grad_norm": 0.12948393095836913, "learning_rate": 1.9326633803437195e-06, "loss": 0.0, "num_tokens": 28567115.0, "reward": 0.7421875, "reward_std": 0.1525501012802124, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 329 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.974924309348006e-09, "advantages/std": 0.4676155745983124, "advantages/var": 0.21866432560690985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.946236559139785, "grad_norm": 0.11737178296005357, "learning_rate": 1.932256872087015e-06, "loss": 0.0, "num_tokens": 28646097.0, "reward": 0.7109375, "reward_std": 0.12809401750564575, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 330 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907227504745508e-10, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.9491039426523298, "grad_norm": 0.09571629711822983, "learning_rate": 1.9318491834975396e-06, "loss": -0.0, "num_tokens": 28731611.0, "reward": 0.703125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 331 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.9519713261648746, "grad_norm": 0.13642467920291784, "learning_rate": 1.931440315091469e-06, "loss": 0.0, "num_tokens": 28800915.0, "reward": 0.921875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 332 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.9548387096774194, "grad_norm": 0.053445425827039775, "learning_rate": 1.9310302673864724e-06, "loss": 0.0, "num_tokens": 28879905.0, "reward": 0.9375, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 333 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.4635862598936842e-08, "advantages/std": 0.5726962089538574, "advantages/var": 0.3279809477501203, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.9577060931899641, "grad_norm": 0.11806438466355382, "learning_rate": 1.930619040901712e-06, "loss": 0.0, "num_tokens": 28957308.0, "reward": 0.8984375, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 334 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 0.9605734767025089, "grad_norm": 0.08505699495761096, "learning_rate": 1.930206636157843e-06, "loss": 0.0, "num_tokens": 29038618.0, "reward": 0.7578125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 335 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.764028020123724e-09, "advantages/std": 0.6185677647590637, "advantages/var": 0.3826260795990244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 0.9634408602150538, "grad_norm": 0.14199404018098058, "learning_rate": 1.929793053677012e-06, "loss": -0.0, "num_tokens": 29126806.0, "reward": 0.59375, "reward_std": 0.18648964166641235, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.4930621087551117, "step": 336 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.234939828648251e-09, "advantages/std": 0.5228003859519958, "advantages/var": 0.2733202435515558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 0.9663082437275986, "grad_norm": 0.12599572121905642, "learning_rate": 1.929378293982857e-06, "loss": 0.0, "num_tokens": 29212479.0, "reward": 0.6171875, "reward_std": 0.14913025498390198, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 337 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2525831708228704e-09, "advantages/std": 0.5726664066314697, "advantages/var": 0.32794681328419983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 0.9691756272401434, "grad_norm": 0.111670356484796, "learning_rate": 1.928962357600506e-06, "loss": 0.0, "num_tokens": 29298954.0, "reward": 0.7734375, "reward_std": 0.13941730558872223, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 338 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 0.9720430107526882, "grad_norm": 0.16320620762576749, "learning_rate": 1.928545245056577e-06, "loss": 0.0, "num_tokens": 29376003.0, "reward": 0.71875, "reward_std": 0.1173202320933342, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 339 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 0.974910394265233, "grad_norm": 0.08899428414824657, "learning_rate": 1.9281269568791776e-06, "loss": -0.0, "num_tokens": 29464959.0, "reward": 0.6484375, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 340 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.9777777777777777, "grad_norm": 0.07683186211082983, "learning_rate": 1.9277074935979034e-06, "loss": 0.0, "num_tokens": 29540813.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 341 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199522104181912e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.9806451612903225, "grad_norm": 0.11069940579873279, "learning_rate": 1.9272868557438377e-06, "loss": 0.0, "num_tokens": 29613965.0, "reward": 0.8359375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 342 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 0.9835125448028674, "grad_norm": 0.07745781028622259, "learning_rate": 1.9268650438495512e-06, "loss": 0.0, "num_tokens": 29691372.0, "reward": 0.8671875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 343 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.9708058921609245e-09, "advantages/std": 0.46761149168014526, "advantages/var": 0.21866050715133056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 0.9863799283154122, "grad_norm": 0.09177326008077252, "learning_rate": 1.9264420584491013e-06, "loss": -0.0, "num_tokens": 29767997.0, "reward": 0.6875, "reward_std": 0.12255740165710449, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 344 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 9.797750300383811e-09, "advantages/std": 0.5228010416030884, "advantages/var": 0.27332092910127415, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 0.989247311827957, "grad_norm": 0.14473041714484297, "learning_rate": 1.9260179000780308e-06, "loss": -0.0, "num_tokens": 29870801.0, "reward": 0.484375, "reward_std": 0.15019109845161438, "rewards/drgrpo_math_reward/mean": 0.484375, "rewards/drgrpo_math_reward/std": 0.5017194747924805, "step": 345 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.053926931635423e-08, "advantages/std": 0.6185683012008667, "advantages/var": 0.38262674325052615, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 0.9921146953405018, "grad_norm": 0.1201267373241665, "learning_rate": 1.9255925692733675e-06, "loss": -0.0, "num_tokens": 29962857.0, "reward": 0.7109375, "reward_std": 0.18755048513412476, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 346 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.3361080419982039e-08, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 0.9949820788530466, "grad_norm": 0.10345892575163858, "learning_rate": 1.925166066573624e-06, "loss": 0.0, "num_tokens": 30042140.0, "reward": 0.875, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 347 }, { "advantages/mean": 7.450580596923828e-09, "advantages/snr": 1.4251481140387377e-08, "advantages/std": 0.5227934122085571, "advantages/var": 0.27331295184866633, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 0.9978494623655914, "grad_norm": 0.09853084355490387, "learning_rate": 1.9247383925187957e-06, "loss": -0.0, "num_tokens": 30136592.0, "reward": 0.640625, "reward_std": 0.1422954797744751, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 348 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987550502194943e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.0028673835125448, "grad_norm": 0.09222286368221157, "learning_rate": 1.924309547650363e-06, "loss": 0.0, "num_tokens": 30226361.0, "reward": 0.7890625, "reward_std": 0.11336850374937057, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 349 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.0057347670250896, "grad_norm": 0.09281704064489878, "learning_rate": 1.9238795325112867e-06, "loss": 0.0, "num_tokens": 30301423.0, "reward": 0.8125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 350 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.0086021505376344, "grad_norm": 0.11214316964016033, "learning_rate": 1.9234483476460102e-06, "loss": -0.0, "num_tokens": 30378627.0, "reward": 0.828125, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 351 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 6.639167740785514e-09, "advantages/std": 0.7013850212097168, "advantages/var": 0.4919409479773549, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.0114695340501791, "grad_norm": 0.14615136733815426, "learning_rate": 1.9230159936004578e-06, "loss": -0.0, "num_tokens": 30465853.0, "reward": 0.59375, "reward_std": 0.22962790727615356, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.4930621087551117, "step": 352 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 1.014336917562724, "grad_norm": 0.09620376443450143, "learning_rate": 1.922582470922034e-06, "loss": -0.0, "num_tokens": 30555585.0, "reward": 0.53125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.53125, "rewards/drgrpo_math_reward/std": 0.5009832978248596, "step": 353 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.9750579720916185e-09, "advantages/std": 0.4676051139831543, "advantages/var": 0.21865454262319872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.0172043010752687, "grad_norm": 0.09875036348818256, "learning_rate": 1.922147780159623e-06, "loss": 0.0, "num_tokens": 30636497.0, "reward": 0.7578125, "reward_std": 0.11784426867961884, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 354 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.0200716845878137, "grad_norm": 0.10569425566424312, "learning_rate": 1.921711921863588e-06, "loss": -0.0, "num_tokens": 30721054.0, "reward": 0.7734375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 355 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.0570579543741361e-08, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.0229390681003585, "grad_norm": 0.09538101147812049, "learning_rate": 1.9212748965857696e-06, "loss": 0.0, "num_tokens": 30809381.0, "reward": 0.7421875, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 356 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.344354173221399e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.0258064516129033, "grad_norm": 0.13437854088249473, "learning_rate": 1.9208367048794875e-06, "loss": 0.0, "num_tokens": 30894319.0, "reward": 0.71875, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 357 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065577896776649e-09, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 1.028673835125448, "grad_norm": 0.10619439403681721, "learning_rate": 1.9203973472995368e-06, "loss": 0.0, "num_tokens": 30985248.0, "reward": 0.6328125, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 358 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 8.049402934764073e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.0315412186379929, "grad_norm": 0.0571904181498558, "learning_rate": 1.9199568244021894e-06, "loss": 0.0, "num_tokens": 31060470.0, "reward": 0.875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 359 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967079601050182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.0344086021505376, "grad_norm": 0.07861517559921144, "learning_rate": 1.9195151367451928e-06, "loss": 0.0, "num_tokens": 31140229.0, "reward": 0.890625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 360 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.0372759856630824, "grad_norm": 0.06784142929071403, "learning_rate": 1.9190722848877683e-06, "loss": 0.0, "num_tokens": 31225829.0, "reward": 0.65625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 361 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252446745927492e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.0401433691756272, "grad_norm": 0.10398935060390492, "learning_rate": 1.9186282693906115e-06, "loss": -0.0, "num_tokens": 31309772.0, "reward": 0.6015625, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 362 }, { "advantages/mean": 7.916241884231567e-09, "advantages/snr": 1.5141977736956797e-08, "advantages/std": 0.5228010416030884, "advantages/var": 0.27332092910127415, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.043010752688172, "grad_norm": 0.11527620657387849, "learning_rate": 1.9181830908158926e-06, "loss": 0.0, "num_tokens": 31394106.0, "reward": 0.71875, "reward_std": 0.15019109845161438, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 363 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599761052090956e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.0458781362007168, "grad_norm": 0.1078137751536225, "learning_rate": 1.9177367497272524e-06, "loss": -0.0, "num_tokens": 31473779.0, "reward": 0.6484375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 364 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262414834030685e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.0487455197132616, "grad_norm": 0.09935113929320916, "learning_rate": 1.9172892466898046e-06, "loss": -0.0, "num_tokens": 31554710.0, "reward": 0.703125, "reward_std": 0.16097760200500488, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 365 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.0516129032258064, "grad_norm": 0.12190199299698741, "learning_rate": 1.916840582270134e-06, "loss": 0.0, "num_tokens": 31633384.0, "reward": 0.75, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 366 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726947784423828, "advantages/var": 0.32797930925516994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.0544802867383511, "grad_norm": 0.0931859343367811, "learning_rate": 1.916390757036296e-06, "loss": 0.0, "num_tokens": 31725141.0, "reward": 0.8046875, "reward_std": 0.17464229464530945, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 367 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.505062750816392e-09, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.0573476702508962, "grad_norm": 0.12607688041530746, "learning_rate": 1.9159397715578158e-06, "loss": 0.0, "num_tokens": 31805654.0, "reward": 0.875, "reward_std": 0.15072786808013916, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 368 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.060215053763441, "grad_norm": 0.10770929354909455, "learning_rate": 1.915487626405686e-06, "loss": 0.0, "num_tokens": 31896248.0, "reward": 0.796875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 369 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299811976811062e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.0630824372759857, "grad_norm": 0.07469731951388661, "learning_rate": 1.9150343221523694e-06, "loss": 0.0, "num_tokens": 31965584.0, "reward": 0.8046875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 370 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131329284837406e-09, "advantages/std": 0.5726754665374756, "advantages/var": 0.3279571899739153, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 1.0659498207885305, "grad_norm": 0.1006389283231685, "learning_rate": 1.914579859371796e-06, "loss": 0.0, "num_tokens": 32051745.0, "reward": 0.6875, "reward_std": 0.15072788298130035, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 371 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.0688172043010753, "grad_norm": 0.1279205858622629, "learning_rate": 1.914124238639362e-06, "loss": 0.0, "num_tokens": 32147060.0, "reward": 0.7421875, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 372 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.011311787641491e-09, "advantages/std": 0.6185494065284729, "advantages/var": 0.38260336831672603, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.07168458781362, "grad_norm": 0.1263450693937804, "learning_rate": 1.91366746053193e-06, "loss": -0.0, "num_tokens": 32233181.0, "reward": 0.734375, "reward_std": 0.16151440143585205, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 373 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878778782737308e-09, "advantages/std": 0.5726776719093323, "advantages/var": 0.3279597159034928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.0745519713261649, "grad_norm": 0.1320136897278803, "learning_rate": 1.913209525627828e-06, "loss": 0.0, "num_tokens": 32318847.0, "reward": 0.8515625, "reward_std": 0.154142826795578, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 374 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.0774193548387097, "grad_norm": 0.03337057156737115, "learning_rate": 1.912750434506848e-06, "loss": 0.0, "num_tokens": 32399870.0, "reward": 0.7421875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 375 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.0802867383512544, "grad_norm": 0.0909052441690691, "learning_rate": 1.912290187750247e-06, "loss": 0.0, "num_tokens": 32481002.0, "reward": 0.734375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 376 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958683201273463e-10, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.0831541218637992, "grad_norm": 0.08128416112795446, "learning_rate": 1.9118287859407446e-06, "loss": -0.0, "num_tokens": 32561635.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 377 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.086021505376344, "grad_norm": 0.0686112977829753, "learning_rate": 1.9113662296625223e-06, "loss": -0.0, "num_tokens": 32633079.0, "reward": 0.7421875, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 378 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.0888888888888888, "grad_norm": 0.07426607789781772, "learning_rate": 1.9109025195012243e-06, "loss": 0.0, "num_tokens": 32719141.0, "reward": 0.71875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 379 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878856475277099e-09, "advantages/std": 0.5726685523986816, "advantages/var": 0.3279492709064016, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.0917562724014336, "grad_norm": 0.15765355244245488, "learning_rate": 1.9104376560439544e-06, "loss": 0.0, "num_tokens": 32800653.0, "reward": 0.65625, "reward_std": 0.14283226430416107, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 380 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.0111551007775335e-09, "advantages/std": 0.6185815930366516, "advantages/var": 0.3826431872437617, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.0946236559139786, "grad_norm": 0.1122812046090102, "learning_rate": 1.9099716398792783e-06, "loss": -0.0, "num_tokens": 32880981.0, "reward": 0.6796875, "reward_std": 0.20357418060302734, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 381 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.1499403476539522e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.0974910394265234, "grad_norm": 0.11363141288519696, "learning_rate": 1.90950447159722e-06, "loss": -0.0, "num_tokens": 32959971.0, "reward": 0.6953125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 382 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.1003584229390682, "grad_norm": 0.09497901297409572, "learning_rate": 1.9090361517892617e-06, "loss": 0.0, "num_tokens": 33033418.0, "reward": 0.75, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 383 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907175712742112e-09, "advantages/std": 0.5227934122085571, "advantages/var": 0.27331295184866633, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 1.103225806451613, "grad_norm": 0.06460150146892275, "learning_rate": 1.9085666810483457e-06, "loss": -0.0, "num_tokens": 33129949.0, "reward": 0.734375, "reward_std": 0.1422954797744751, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 384 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.1060931899641577, "grad_norm": 0.12433395722331816, "learning_rate": 1.908096059968869e-06, "loss": 0.0, "num_tokens": 33206078.0, "reward": 0.859375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 385 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125710510822718e-09, "advantages/std": 0.5227956175804138, "advantages/var": 0.2733152577612863, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.1089605734767025, "grad_norm": 0.1081801352944286, "learning_rate": 1.907624289146686e-06, "loss": -0.0, "num_tokens": 33289958.0, "reward": 0.8515625, "reward_std": 0.14230038225650787, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 386 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628430692729714e-09, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.1118279569892473, "grad_norm": 0.09246587460981719, "learning_rate": 1.9071513691791077e-06, "loss": -0.0, "num_tokens": 33380170.0, "reward": 0.7734375, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 387 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 7.528248275317514e-09, "advantages/std": 0.6185519695281982, "advantages/var": 0.3826065390072131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.114695340501792, "grad_norm": 0.14999799399924482, "learning_rate": 1.9066773006648988e-06, "loss": 0.0, "num_tokens": 33459127.0, "reward": 0.875, "reward_std": 0.1659901738166809, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 388 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599751911938768e-09, "advantages/std": 0.40494468808174133, "advantages/var": 0.16398020040561878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 1.1175627240143369, "grad_norm": 0.08925593566328756, "learning_rate": 1.906202084204279e-06, "loss": 0.0, "num_tokens": 33527677.0, "reward": 0.84375, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 389 }, { "advantages/mean": 6.984919309616089e-09, "advantages/snr": 1.3360585347103394e-08, "advantages/std": 0.5228003859519958, "advantages/var": 0.2733202435515558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.1204301075268817, "grad_norm": 0.11445198267119233, "learning_rate": 1.9057257203989203e-06, "loss": -0.0, "num_tokens": 33619759.0, "reward": 0.5859375, "reward_std": 0.14913025498390198, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 390 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 9.85858603726782e-09, "advantages/std": 0.661277174949646, "advantages/var": 0.4372875021093847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.1232974910394264, "grad_norm": 0.17799224247260523, "learning_rate": 1.905248209851949e-06, "loss": 0.0, "num_tokens": 33704395.0, "reward": 0.8046875, "reward_std": 0.2120065987110138, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 391 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 7.96685234149694e-09, "advantages/std": 0.7013981342315674, "advantages/var": 0.4919593427035238, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.1261648745519715, "grad_norm": 0.16809757067938313, "learning_rate": 1.904769553167942e-06, "loss": -0.0, "num_tokens": 33797595.0, "reward": 0.6953125, "reward_std": 0.24883408844470978, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 392 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983429612088697e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.129032258064516, "grad_norm": 0.13776121916921183, "learning_rate": 1.9042897509529277e-06, "loss": -0.0, "num_tokens": 33882619.0, "reward": 0.6796875, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 393 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524227119579474e-09, "advantages/std": 0.5726946592330933, "advantages/var": 0.3279791727141088, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.131899641577061, "grad_norm": 0.148615746974474, "learning_rate": 1.9038088038143849e-06, "loss": -0.0, "num_tokens": 33962960.0, "reward": 0.8359375, "reward_std": 0.17123225331306458, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 394 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.1347670250896058, "grad_norm": 0.09902327906445091, "learning_rate": 1.9033267123612417e-06, "loss": 0.0, "num_tokens": 34050097.0, "reward": 0.7421875, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 395 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516894993554525e-09, "advantages/std": 0.6185593605041504, "advantages/var": 0.3826156824673035, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.1376344086021506, "grad_norm": 0.15064301550607218, "learning_rate": 1.9028434772038762e-06, "loss": 0.0, "num_tokens": 34148015.0, "reward": 0.7265625, "reward_std": 0.17517907917499542, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 396 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.1405017921146954, "grad_norm": 0.06188828252462216, "learning_rate": 1.9023590989541126e-06, "loss": 0.0, "num_tokens": 34226218.0, "reward": 0.7734375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 397 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726962685585022, "advantages/var": 0.3279810160208321, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 1.1433691756272402, "grad_norm": 0.12212723663588687, "learning_rate": 1.9018735782252242e-06, "loss": 0.0, "num_tokens": 34325697.0, "reward": 0.6171875, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 398 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299880526045478e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.146236559139785, "grad_norm": 0.06080905471047067, "learning_rate": 1.9013869156319296e-06, "loss": 0.0, "num_tokens": 34404988.0, "reward": 0.8203125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 399 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 9.29469581896395e-09, "advantages/std": 0.7013955116271973, "advantages/var": 0.4919556637307778, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.578125, "epoch": 1.1491039426523297, "grad_norm": 0.15628603367531402, "learning_rate": 1.9008991117903937e-06, "loss": -0.0, "num_tokens": 34496070.0, "reward": 0.5625, "reward_std": 0.2467075139284134, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49802759289741516, "step": 400 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 1.494855892459685e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.1519713261648745, "grad_norm": 0.09673900792502516, "learning_rate": 1.9004101673182258e-06, "loss": -0.0, "num_tokens": 34585623.0, "reward": 0.6796875, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 401 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.609832486150278e-08, "advantages/std": 0.40496498346328735, "advantages/var": 0.1639966378314206, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.1548387096774193, "grad_norm": 0.05818237777324348, "learning_rate": 1.8999200828344804e-06, "loss": 0.0, "num_tokens": 34666399.0, "reward": 0.875, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 402 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131336901697576e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.157706093189964, "grad_norm": 0.10039950957745691, "learning_rate": 1.8994288589596539e-06, "loss": 0.0, "num_tokens": 34755804.0, "reward": 0.7578125, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 403 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.1579550221161443e-08, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.1605734767025089, "grad_norm": 0.10524671812993915, "learning_rate": 1.8989364963156868e-06, "loss": -0.0, "num_tokens": 34841675.0, "reward": 0.8046875, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 404 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.1634408602150539, "grad_norm": 0.1312953481263664, "learning_rate": 1.8984429955259604e-06, "loss": 0.0, "num_tokens": 34919375.0, "reward": 0.8359375, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 405 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46761295199394226, "advantages/var": 0.21866187287248895, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.1663082437275984, "grad_norm": 0.08193295618037837, "learning_rate": 1.8979483572152972e-06, "loss": 0.0, "num_tokens": 35000480.0, "reward": 0.75, "reward_std": 0.1246790736913681, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 406 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.199522104181912e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 1.1691756272401435, "grad_norm": 0.06823624130547844, "learning_rate": 1.8974525820099605e-06, "loss": -0.0, "num_tokens": 35100593.0, "reward": 0.6015625, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 407 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.1720430107526882, "grad_norm": 0.0809172892703687, "learning_rate": 1.8969556705376518e-06, "loss": -0.0, "num_tokens": 35174411.0, "reward": 0.765625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 408 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721682514236524e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.174910394265233, "grad_norm": 0.15140274321080863, "learning_rate": 1.896457623427512e-06, "loss": -0.0, "num_tokens": 35264135.0, "reward": 0.640625, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 409 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917221686896894e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.1777777777777778, "grad_norm": 0.11361003928383843, "learning_rate": 1.8959584413101206e-06, "loss": -0.0, "num_tokens": 35351533.0, "reward": 0.6796875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 410 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.1266523706756892e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.1806451612903226, "grad_norm": 0.04996767911029415, "learning_rate": 1.8954581248174925e-06, "loss": 0.0, "num_tokens": 35432543.0, "reward": 0.7421875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 411 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.1835125448028674, "grad_norm": 0.076903562461185, "learning_rate": 1.8949566745830801e-06, "loss": 0.0, "num_tokens": 35510747.0, "reward": 0.6171875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 412 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.2675146677787739e-08, "advantages/std": 0.661286473274231, "advantages/var": 0.4372997997354702, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.1863799283154122, "grad_norm": 0.12710704937627615, "learning_rate": 1.8944540912417708e-06, "loss": 0.0, "num_tokens": 35598106.0, "reward": 0.734375, "reward_std": 0.22567126154899597, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 413 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954488684246125e-08, "advantages/std": 0.4675959050655365, "advantages/var": 0.21864593043405822, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.189247311827957, "grad_norm": 0.08600713301720207, "learning_rate": 1.8939503754298865e-06, "loss": 0.0, "num_tokens": 35680882.0, "reward": 0.8125, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 414 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983450684521008e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.1921146953405017, "grad_norm": 0.0961858939890946, "learning_rate": 1.893445527785183e-06, "loss": 0.0, "num_tokens": 35755324.0, "reward": 0.859375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 415 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 1.1949820788530465, "grad_norm": 0.060517150584917266, "learning_rate": 1.8929395489468494e-06, "loss": 0.0, "num_tokens": 35840539.0, "reward": 0.6796875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 416 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 1.1978494623655913, "grad_norm": 0.07215171444096555, "learning_rate": 1.8924324395555066e-06, "loss": 0.0, "num_tokens": 35915811.0, "reward": 0.59375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.4930621087551117, "step": 417 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 8.049402934764073e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.2007168458781363, "grad_norm": 0.24810926566863736, "learning_rate": 1.891924200253207e-06, "loss": -0.0, "num_tokens": 35998688.0, "reward": 0.71875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 418 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.563002308053355e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.2035842293906809, "grad_norm": 0.09293770477831811, "learning_rate": 1.8914148316834337e-06, "loss": -0.0, "num_tokens": 36086737.0, "reward": 0.6484375, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 419 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199094228701277e-09, "advantages/std": 0.4049627184867859, "advantages/var": 0.1639948033642078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.206451612903226, "grad_norm": 0.0855377069771544, "learning_rate": 1.8909043344911e-06, "loss": 0.0, "num_tokens": 36170106.0, "reward": 0.5625, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49802759289741516, "step": 420 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344503462080032e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.2093189964157707, "grad_norm": 0.08230445158939939, "learning_rate": 1.890392709322547e-06, "loss": 0.0, "num_tokens": 36242565.0, "reward": 0.7734375, "reward_std": 0.12073517590761185, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 421 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.504923957817997e-09, "advantages/std": 0.5726877450942993, "advantages/var": 0.32797125338119315, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.2121863799283155, "grad_norm": 0.17186986086914186, "learning_rate": 1.889879956825545e-06, "loss": 0.0, "num_tokens": 36330470.0, "reward": 0.6953125, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 422 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.2150537634408602, "grad_norm": 0.0892643304423356, "learning_rate": 1.8893660776492911e-06, "loss": -0.0, "num_tokens": 36401660.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 423 }, { "advantages/mean": 6.51925802230835e-09, "advantages/snr": 1.1383827823242494e-08, "advantages/std": 0.5726771354675293, "advantages/var": 0.3279591014872949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 1.217921146953405, "grad_norm": 0.11507868659846966, "learning_rate": 1.8888510724444092e-06, "loss": -0.0, "num_tokens": 36483608.0, "reward": 0.703125, "reward_std": 0.1530819833278656, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 424 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504949684853452e-09, "advantages/std": 0.5726854801177979, "advantages/var": 0.32796865913775264, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.2207885304659498, "grad_norm": 0.09813999306223548, "learning_rate": 1.8883349418629485e-06, "loss": -0.0, "num_tokens": 36576834.0, "reward": 0.671875, "reward_std": 0.15992169082164764, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 425 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.90714930920747e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.2236559139784946, "grad_norm": 0.10870643147935784, "learning_rate": 1.8878176865583831e-06, "loss": 0.0, "num_tokens": 36661457.0, "reward": 0.734375, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 426 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.2265232974910394, "grad_norm": 0.07322473260223201, "learning_rate": 1.8872993071856112e-06, "loss": -0.0, "num_tokens": 36752775.0, "reward": 0.625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 427 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.2293906810035842, "grad_norm": 0.0673104946087146, "learning_rate": 1.8867798044009546e-06, "loss": -0.0, "num_tokens": 36826486.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 428 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 1.232258064516129, "grad_norm": 0.060029529820155615, "learning_rate": 1.886259178862157e-06, "loss": 0.0, "num_tokens": 36920849.0, "reward": 0.6171875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 429 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379866977655094e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.2351254480286737, "grad_norm": 0.12504837454470752, "learning_rate": 1.8857374312283835e-06, "loss": -0.0, "num_tokens": 36989818.0, "reward": 0.8828125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 430 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9753001796492024e-09, "advantages/std": 0.4675861597061157, "advantages/var": 0.21863681674871316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.2379928315412188, "grad_norm": 0.06734258077817594, "learning_rate": 1.8852145621602204e-06, "loss": 0.0, "num_tokens": 37070613.0, "reward": 0.6953125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 431 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.318093530282481e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.2408602150537635, "grad_norm": 0.10759439920719689, "learning_rate": 1.8846905723196732e-06, "loss": 0.0, "num_tokens": 37151115.0, "reward": 0.7890625, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 432 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.562795544129766e-09, "advantages/std": 0.5228043794631958, "advantages/var": 0.2733244191858972, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.2437275985663083, "grad_norm": 0.12803270673421868, "learning_rate": 1.8841654623701671e-06, "loss": -0.0, "num_tokens": 37239984.0, "reward": 0.71875, "reward_std": 0.1514892876148224, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 433 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131306434342512e-10, "advantages/std": 0.5726770758628845, "advantages/var": 0.327959033218864, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.246594982078853, "grad_norm": 0.11317202504210273, "learning_rate": 1.8836392329765448e-06, "loss": -0.0, "num_tokens": 37321224.0, "reward": 0.765625, "reward_std": 0.1530819833278656, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 434 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2525191884915387e-09, "advantages/std": 0.5726776719093323, "advantages/var": 0.3279597159034928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.249462365591398, "grad_norm": 0.15134775700601374, "learning_rate": 1.8831118848050666e-06, "loss": 0.0, "num_tokens": 37390443.0, "reward": 0.8671875, "reward_std": 0.154142826795578, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 435 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562923093105361e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.2523297491039427, "grad_norm": 0.09403764676074379, "learning_rate": 1.8825834185234098e-06, "loss": 0.0, "num_tokens": 37476406.0, "reward": 0.734375, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 436 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.225143719438781e-09, "advantages/std": 0.6612716317176819, "advantages/var": 0.4372801709145655, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.2551971326164875, "grad_norm": 0.11909031388211558, "learning_rate": 1.8820538348006666e-06, "loss": 0.0, "num_tokens": 37572061.0, "reward": 0.609375, "reward_std": 0.20517179369926453, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 437 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.2580645161290323, "grad_norm": 0.09797707756153493, "learning_rate": 1.8815231343073444e-06, "loss": -0.0, "num_tokens": 37660072.0, "reward": 0.5703125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4969765841960907, "step": 438 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.5167326511663e-09, "advantages/std": 0.6185815930366516, "advantages/var": 0.3826431872437617, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.260931899641577, "grad_norm": 0.11722912957013189, "learning_rate": 1.880991317715364e-06, "loss": -0.0, "num_tokens": 37753098.0, "reward": 0.7265625, "reward_std": 0.20357418060302734, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 439 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504965933612274e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.2637992831541218, "grad_norm": 0.13012720565928393, "learning_rate": 1.8804583856980603e-06, "loss": 0.0, "num_tokens": 37828784.0, "reward": 0.90625, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 440 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 8.45014868788346e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.2666666666666666, "grad_norm": 0.06212992035853581, "learning_rate": 1.8799243389301796e-06, "loss": 0.0, "num_tokens": 37897771.0, "reward": 0.8359375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 441 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814751550759118e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.2695340501792114, "grad_norm": 0.06849675578675044, "learning_rate": 1.8793891780878798e-06, "loss": -0.0, "num_tokens": 37972188.0, "reward": 0.8359375, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 442 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.562891001898203e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.2724014336917562, "grad_norm": 0.07792881497712156, "learning_rate": 1.8788529038487296e-06, "loss": 0.0, "num_tokens": 38064377.0, "reward": 0.59375, "reward_std": 0.13781969249248505, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.4930621087551117, "step": 443 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629340611188405e-09, "advantages/std": 0.5227840542793274, "advantages/var": 0.27330316740873073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.2752688172043012, "grad_norm": 0.10833904988467241, "learning_rate": 1.8783155168917068e-06, "loss": 0.0, "num_tokens": 38145860.0, "reward": 0.734375, "reward_std": 0.13204573094844818, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 444 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.494855892459685e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.2781362007168457, "grad_norm": 0.07476540605174994, "learning_rate": 1.8777770178971987e-06, "loss": 0.0, "num_tokens": 38227493.0, "reward": 0.7578125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 445 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.516761811003565e-09, "advantages/std": 0.6185775995254517, "advantages/var": 0.38263824663467005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.2810035842293908, "grad_norm": 0.1370815896913483, "learning_rate": 1.8772374075470006e-06, "loss": -0.0, "num_tokens": 38324888.0, "reward": 0.5390625, "reward_std": 0.19674429297447205, "rewards/drgrpo_math_reward/mean": 0.5390625, "rewards/drgrpo_math_reward/std": 0.5004304051399231, "step": 446 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.781509278854418e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.2838709677419355, "grad_norm": 0.0993472149618786, "learning_rate": 1.8766966865243136e-06, "loss": 0.0, "num_tokens": 38410186.0, "reward": 0.796875, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 447 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131154947272497e-09, "advantages/std": 0.5726877450942993, "advantages/var": 0.32797125338119315, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.2867383512544803, "grad_norm": 0.1029740923633324, "learning_rate": 1.8761548555137466e-06, "loss": 0.0, "num_tokens": 38498004.0, "reward": 0.7421875, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 448 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 1.1950401561551717e-08, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.2896057347670251, "grad_norm": 0.09605681530603226, "learning_rate": 1.875611915201313e-06, "loss": 0.0, "num_tokens": 38577923.0, "reward": 0.765625, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 449 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970882307601416e-09, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.29247311827957, "grad_norm": 0.09269184158527434, "learning_rate": 1.8750678662744308e-06, "loss": 0.0, "num_tokens": 38658214.0, "reward": 0.78125, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 450 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383531621724775e-08, "advantages/std": 0.5726920366287231, "advantages/var": 0.32797616881795477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 1.2953405017921147, "grad_norm": 0.12109369035741321, "learning_rate": 1.8745227094219218e-06, "loss": 0.0, "num_tokens": 38745225.0, "reward": 0.6640625, "reward_std": 0.1701665222644806, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 451 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125678014490734e-09, "advantages/std": 0.5227980017662048, "advantages/var": 0.2733177506507367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.2982078853046595, "grad_norm": 0.08139897608330982, "learning_rate": 1.8739764453340107e-06, "loss": 0.0, "num_tokens": 38829061.0, "reward": 0.90625, "reward_std": 0.14571532607078552, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 452 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 1.379887186086637e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.3010752688172043, "grad_norm": 0.08981833431419232, "learning_rate": 1.8734290747023237e-06, "loss": -0.0, "num_tokens": 38908410.0, "reward": 0.8046875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 453 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016504754270957e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.303942652329749, "grad_norm": 0.12137991283167442, "learning_rate": 1.8728805982198877e-06, "loss": 0.0, "num_tokens": 38993045.0, "reward": 0.6875, "reward_std": 0.13781969249248505, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 454 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 8.944303023925922e-09, "advantages/std": 0.5726856589317322, "advantages/var": 0.3279688639460723, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.3068100358422938, "grad_norm": 0.11385847015754162, "learning_rate": 1.8723310165811308e-06, "loss": 0.0, "num_tokens": 39074152.0, "reward": 0.890625, "reward_std": 0.16333173215389252, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 455 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.1498803472501986e-09, "advantages/std": 0.40496498346328735, "advantages/var": 0.1639966378314206, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.3096774193548386, "grad_norm": 0.08996537497957356, "learning_rate": 1.8717803304818794e-06, "loss": 0.0, "num_tokens": 39162360.0, "reward": 0.796875, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 456 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966610933966542e-09, "advantages/std": 0.4676129221916199, "advantages/var": 0.21866184500058594, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.3125448028673836, "grad_norm": 0.08467746260660809, "learning_rate": 1.8712285406193585e-06, "loss": 0.0, "num_tokens": 39247316.0, "reward": 0.84375, "reward_std": 0.1246790662407875, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 457 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.3278566809822465e-09, "advantages/std": 0.7013728022575378, "advantages/var": 0.4919238077465913, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.3154121863799282, "grad_norm": 0.14786741187405614, "learning_rate": 1.8706756476921907e-06, "loss": 0.0, "num_tokens": 39335135.0, "reward": 0.6015625, "reward_std": 0.21254336833953857, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 458 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562869878887461e-09, "advantages/std": 0.5227934718132019, "advantages/var": 0.27331301417050113, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.3182795698924732, "grad_norm": 0.11689445554737352, "learning_rate": 1.8701216524003953e-06, "loss": 0.0, "num_tokens": 39424066.0, "reward": 0.65625, "reward_std": 0.1422954797744751, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 459 }, { "advantages/mean": 7.450580596923828e-09, "advantages/snr": 1.2044824669190209e-08, "advantages/std": 0.6185711026191711, "advantages/var": 0.38263020899549716, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.321146953405018, "grad_norm": 0.09807668103246633, "learning_rate": 1.8695665554453868e-06, "loss": -0.0, "num_tokens": 39508401.0, "reward": 0.7109375, "reward_std": 0.18884865939617157, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 460 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.4536934733273465e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.3240143369175628, "grad_norm": 0.06544111566552296, "learning_rate": 1.8690103575299752e-06, "loss": 0.0, "num_tokens": 39593601.0, "reward": 0.75, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 461 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.2196958382045529e-08, "advantages/std": 0.5726771354675293, "advantages/var": 0.3279591014872949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.3268817204301075, "grad_norm": 0.12305107284888928, "learning_rate": 1.8684530593583636e-06, "loss": 0.0, "num_tokens": 39670753.0, "reward": 0.796875, "reward_std": 0.1530819982290268, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 462 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.3444254652277355e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.3297491039426523, "grad_norm": 0.12313119220773869, "learning_rate": 1.867894661636149e-06, "loss": -0.0, "num_tokens": 39757110.0, "reward": 0.6640625, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 463 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.572691023349762, "advantages/var": 0.3279750082253976, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.3326164874551971, "grad_norm": 0.11870701325721736, "learning_rate": 1.8673351650703201e-06, "loss": 0.0, "num_tokens": 39837325.0, "reward": 0.75, "reward_std": 0.16887323558330536, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 464 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.335483870967742, "grad_norm": 0.0, "learning_rate": 1.866774570369257e-06, "loss": 0.0, "num_tokens": 39916341.0, "reward": 0.8125, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 465 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185612082481384, "advantages/var": 0.3826179683493969, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.3383512544802867, "grad_norm": 0.1028369035844959, "learning_rate": 1.8662128782427297e-06, "loss": 0.0, "num_tokens": 40007964.0, "reward": 0.6875, "reward_std": 0.1751839816570282, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 466 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524501310221626e-09, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.3412186379928315, "grad_norm": 0.14570376248276248, "learning_rate": 1.8656500894018986e-06, "loss": -0.0, "num_tokens": 40101423.0, "reward": 0.765625, "reward_std": 0.16675159335136414, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 467 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917221686896894e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.3440860215053765, "grad_norm": 0.14877932191107487, "learning_rate": 1.8650862045593114e-06, "loss": 0.0, "num_tokens": 40184237.0, "reward": 0.6640625, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 468 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950220288145723e-08, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.346953405017921, "grad_norm": 0.08974549401489106, "learning_rate": 1.8645212244289047e-06, "loss": 0.0, "num_tokens": 40263229.0, "reward": 0.875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 469 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599556592183647e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.349820788530466, "grad_norm": 0.08507901581401366, "learning_rate": 1.8639551497260007e-06, "loss": 0.0, "num_tokens": 40349793.0, "reward": 0.6796875, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 470 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.3526881720430106, "grad_norm": 0.08376628223431468, "learning_rate": 1.8633879811673086e-06, "loss": 0.0, "num_tokens": 40424175.0, "reward": 0.7578125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 471 }, { "advantages/mean": -7.916241884231567e-09, "advantages/snr": 1.1286513223603052e-08, "advantages/std": 0.701389491558075, "advantages/var": 0.4919472188680949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.3555555555555556, "grad_norm": 0.1185452237370237, "learning_rate": 1.8628197194709213e-06, "loss": 0.0, "num_tokens": 40514516.0, "reward": 0.65625, "reward_std": 0.23857945203781128, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 472 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.3584229390681004, "grad_norm": 0.09299044822566034, "learning_rate": 1.862250365356317e-06, "loss": -0.0, "num_tokens": 40605730.0, "reward": 0.6328125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 473 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 1.3612903225806452, "grad_norm": 0.1015467554994938, "learning_rate": 1.8616799195443563e-06, "loss": -0.0, "num_tokens": 40690416.0, "reward": 0.7265625, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 474 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.8788021410185465e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.36415770609319, "grad_norm": 0.12127591444840169, "learning_rate": 1.8611083827572815e-06, "loss": 0.0, "num_tokens": 40782303.0, "reward": 0.8203125, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 475 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504917864602565e-09, "advantages/std": 0.5726882815361023, "advantages/var": 0.32797186780877396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.3670250896057348, "grad_norm": 0.11257977885529864, "learning_rate": 1.8605357557187172e-06, "loss": 0.0, "num_tokens": 40868256.0, "reward": 0.75, "reward_std": 0.1643974632024765, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 476 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.3698924731182796, "grad_norm": 0.09588521532557784, "learning_rate": 1.859962039153668e-06, "loss": 0.0, "num_tokens": 40950899.0, "reward": 0.6875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 477 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.878747807970186e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.3727598566308243, "grad_norm": 0.10874589141704576, "learning_rate": 1.8593872337885175e-06, "loss": -0.0, "num_tokens": 41034717.0, "reward": 0.703125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 478 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899641578136434e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.3756272401433691, "grad_norm": 0.07697473222804034, "learning_rate": 1.8588113403510286e-06, "loss": 0.0, "num_tokens": 41118625.0, "reward": 0.9140625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 479 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 1.378494623655914, "grad_norm": 0.10762834890644403, "learning_rate": 1.8582343595703414e-06, "loss": -0.0, "num_tokens": 41200803.0, "reward": 0.703125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 480 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 1.381362007168459, "grad_norm": 0.0867693183099241, "learning_rate": 1.8576562921769726e-06, "loss": -0.0, "num_tokens": 41286801.0, "reward": 0.6875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 481 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131297971228846e-09, "advantages/std": 0.5726776719093323, "advantages/var": 0.3279597159034928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.3842293906810035, "grad_norm": 0.15292227824201848, "learning_rate": 1.8570771389028148e-06, "loss": -0.0, "num_tokens": 41371319.0, "reward": 0.7890625, "reward_std": 0.154142826795578, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 482 }, { "advantages/mean": -1.0244548320770264e-08, "advantages/snr": 1.5491760762766434e-08, "advantages/std": 0.6612901091575623, "advantages/var": 0.4373046084696206, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.3870967741935485, "grad_norm": 0.14366149758916022, "learning_rate": 1.8564969004811354e-06, "loss": 0.0, "num_tokens": 41460754.0, "reward": 0.609375, "reward_std": 0.2290911078453064, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 483 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299807237755752e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.3899641577060933, "grad_norm": 0.07126110064477205, "learning_rate": 1.8559155776465756e-06, "loss": -0.0, "num_tokens": 41545261.0, "reward": 0.734375, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 484 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.392831541218638, "grad_norm": 0.11155526099221504, "learning_rate": 1.8553331711351498e-06, "loss": 0.0, "num_tokens": 41628232.0, "reward": 0.84375, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 485 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.907056898068929e-09, "advantages/std": 0.5228003859519958, "advantages/var": 0.2733202435515558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.3956989247311828, "grad_norm": 0.13006837864211654, "learning_rate": 1.8547496816842446e-06, "loss": -0.0, "num_tokens": 41710869.0, "reward": 0.6484375, "reward_std": 0.14913025498390198, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 486 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.75753319179351e-09, "advantages/std": 0.5726791024208069, "advantages/var": 0.327961354349501, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.3985663082437276, "grad_norm": 0.1012756121633877, "learning_rate": 1.8541651100326172e-06, "loss": 0.0, "num_tokens": 41799936.0, "reward": 0.8359375, "reward_std": 0.15308690071105957, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 487 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262475767256781e-09, "advantages/std": 0.5726819038391113, "advantages/var": 0.32796456298478915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.4014336917562724, "grad_norm": 0.13748119752207785, "learning_rate": 1.853579456920395e-06, "loss": 0.0, "num_tokens": 41878924.0, "reward": 0.8046875, "reward_std": 0.15756267309188843, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 488 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983473534398352e-09, "advantages/std": 0.4675931930541992, "advantages/var": 0.21864339419062162, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.4043010752688172, "grad_norm": 0.10745931360439237, "learning_rate": 1.8529927230890755e-06, "loss": 0.0, "num_tokens": 41951782.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 489 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 1.9917227329728737e-09, "advantages/std": 0.7013947367668152, "advantages/var": 0.49195457676418997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.407168458781362, "grad_norm": 0.15995655732792285, "learning_rate": 1.8524049092815236e-06, "loss": -0.0, "num_tokens": 42036663.0, "reward": 0.7578125, "reward_std": 0.24541422724723816, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 490 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691702426092771e-09, "advantages/std": 0.5726984143257141, "advantages/var": 0.3279834737711873, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.4100358422939068, "grad_norm": 0.11393319820939447, "learning_rate": 1.8518160162419718e-06, "loss": 0.0, "num_tokens": 42120967.0, "reward": 0.796875, "reward_std": 0.17700131237506866, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 491 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.4083418875242214e-09, "advantages/std": 0.6612901091575623, "advantages/var": 0.4373046084696206, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.4129032258064516, "grad_norm": 0.1200229835178394, "learning_rate": 1.8512260447160187e-06, "loss": 0.0, "num_tokens": 42210381.0, "reward": 0.5625, "reward_std": 0.2290911078453064, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49802759289741516, "step": 492 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 8.280895157741607e-09, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.4157706093189963, "grad_norm": 0.13720668328810662, "learning_rate": 1.8506349954506297e-06, "loss": 0.0, "num_tokens": 42291297.0, "reward": 0.875, "reward_std": 0.1820138841867447, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 493 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.527983501262913e-10, "advantages/std": 0.6185737252235413, "advantages/var": 0.3826334535369291, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.4186379928315414, "grad_norm": 0.13930169565034922, "learning_rate": 1.8500428691941328e-06, "loss": 0.0, "num_tokens": 42381129.0, "reward": 0.6640625, "reward_std": 0.19332444667816162, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 494 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.421505376344086, "grad_norm": 0.10696979494855548, "learning_rate": 1.8494496666962206e-06, "loss": -0.0, "num_tokens": 42465657.0, "reward": 0.796875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 495 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983495876754114e-09, "advantages/std": 0.4675905704498291, "advantages/var": 0.2186409415735966, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.424372759856631, "grad_norm": 0.07145280671923458, "learning_rate": 1.848855388707949e-06, "loss": 0.0, "num_tokens": 42547630.0, "reward": 0.859375, "reward_std": 0.10205792635679245, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 496 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.234939117800795e-09, "advantages/std": 0.5228004455566406, "advantages/var": 0.27332030587422196, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.4272401433691757, "grad_norm": 0.09676791260047765, "learning_rate": 1.8482600359817342e-06, "loss": 0.0, "num_tokens": 42627615.0, "reward": 0.8359375, "reward_std": 0.14913025498390198, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 497 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0112444683242622e-09, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.4301075268817205, "grad_norm": 0.14074182529916074, "learning_rate": 1.847663609271354e-06, "loss": 0.0, "num_tokens": 42719557.0, "reward": 0.7421875, "reward_std": 0.17859892547130585, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 498 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975300560494157e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.4329749103942653, "grad_norm": 0.10017823898476097, "learning_rate": 1.847066109331946e-06, "loss": 0.0, "num_tokens": 42800302.0, "reward": 0.8203125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 499 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185612082481384, "advantages/var": 0.3826179683493969, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.43584229390681, "grad_norm": 0.1179838232391151, "learning_rate": 1.8464675369200057e-06, "loss": 0.0, "num_tokens": 42888803.0, "reward": 0.609375, "reward_std": 0.1751839816570282, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 500 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5629027821475993e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.4387096774193548, "grad_norm": 0.09646195424819327, "learning_rate": 1.8458678927933882e-06, "loss": 0.0, "num_tokens": 42979611.0, "reward": 0.609375, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 501 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.4415770609318996, "grad_norm": 0.05897016915946197, "learning_rate": 1.8452671777113033e-06, "loss": 0.0, "num_tokens": 43051799.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 502 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599658819865184e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.4444444444444444, "grad_norm": 0.09539664660420902, "learning_rate": 1.8446653924343188e-06, "loss": -0.0, "num_tokens": 43130374.0, "reward": 0.75, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 503 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8994882297977766e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.4473118279569892, "grad_norm": 0.09780941279649633, "learning_rate": 1.8440625377243557e-06, "loss": -0.0, "num_tokens": 43211497.0, "reward": 0.84375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 504 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.991766726549734e-09, "advantages/std": 0.4675861597061157, "advantages/var": 0.21863681674871316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.450179211469534, "grad_norm": 0.11666079982718623, "learning_rate": 1.8434586143446905e-06, "loss": 0.0, "num_tokens": 43289312.0, "reward": 0.7890625, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 505 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.4530465949820788, "grad_norm": 0.049559262838082686, "learning_rate": 1.842853623059952e-06, "loss": 0.0, "num_tokens": 43367438.0, "reward": 0.9453125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 506 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.4559139784946238, "grad_norm": 0.05497588612217425, "learning_rate": 1.8422475646361208e-06, "loss": -0.0, "num_tokens": 43440860.0, "reward": 0.7578125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 507 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 1.4587813620071683, "grad_norm": 0.06195988661538175, "learning_rate": 1.8416404398405296e-06, "loss": 0.0, "num_tokens": 43533713.0, "reward": 0.75, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 508 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983628920516591e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.4616487455197134, "grad_norm": 0.1537818005926993, "learning_rate": 1.8410322494418603e-06, "loss": 0.0, "num_tokens": 43604263.0, "reward": 0.796875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 509 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.4645161290322581, "grad_norm": 0.0, "learning_rate": 1.8404229942101442e-06, "loss": 0.0, "num_tokens": 43679233.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 510 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958683201273463e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.467383512544803, "grad_norm": 0.16557840487058936, "learning_rate": 1.8398126749167613e-06, "loss": 0.0, "num_tokens": 43759594.0, "reward": 0.7578125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 511 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.0111388527025745e-09, "advantages/std": 0.618584930896759, "advantages/var": 0.38264731673254815, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.4702508960573477, "grad_norm": 0.11408788231286815, "learning_rate": 1.8392012923344378e-06, "loss": -0.0, "num_tokens": 43852880.0, "reward": 0.6875, "reward_std": 0.20593319833278656, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 512 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 9.154460730631332e-09, "advantages/std": 0.6612728834152222, "advantages/var": 0.437281826340282, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.4731182795698925, "grad_norm": 0.21363207577931034, "learning_rate": 1.838588847237247e-06, "loss": 0.0, "num_tokens": 43924952.0, "reward": 0.75, "reward_std": 0.2041158676147461, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 513 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.4759856630824373, "grad_norm": 0.10296469008522874, "learning_rate": 1.8379753404006073e-06, "loss": -0.0, "num_tokens": 44006988.0, "reward": 0.8359375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 514 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.2583373839143607e-09, "advantages/std": 0.6185895204544067, "advantages/var": 0.3826529948160129, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.478853046594982, "grad_norm": 0.11876060878737717, "learning_rate": 1.8373607726012811e-06, "loss": -0.0, "num_tokens": 44084728.0, "reward": 0.6796875, "reward_std": 0.21382391452789307, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 515 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.034946312888557e-08, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.4817204301075269, "grad_norm": 0.08411429785495499, "learning_rate": 1.8367451446173746e-06, "loss": 0.0, "num_tokens": 44165115.0, "reward": 0.6484375, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 516 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.4845878136200716, "grad_norm": 0.05578709283562855, "learning_rate": 1.8361284572283354e-06, "loss": 0.0, "num_tokens": 44246006.0, "reward": 0.7578125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 517 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016755193120049e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.4874551971326164, "grad_norm": 0.11364701408577403, "learning_rate": 1.835510711214953e-06, "loss": 0.0, "num_tokens": 44322867.0, "reward": 0.8203125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 518 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.562954778661877e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.4903225806451612, "grad_norm": 0.10914034535598677, "learning_rate": 1.8348919073593575e-06, "loss": 0.0, "num_tokens": 44414575.0, "reward": 0.78125, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 519 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 8.450170142019348e-09, "advantages/std": 0.6612808108329773, "advantages/var": 0.4372923107759199, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.4931899641577062, "grad_norm": 0.12566288606330955, "learning_rate": 1.834272046445018e-06, "loss": 0.0, "num_tokens": 44516535.0, "reward": 0.7109375, "reward_std": 0.21542643010616302, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 520 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.50497202691776e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.4960573476702508, "grad_norm": 0.08963705937595029, "learning_rate": 1.8336511292567418e-06, "loss": -0.0, "num_tokens": 44596646.0, "reward": 0.8359375, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 521 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383871662376608e-08, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.4989247311827958, "grad_norm": 0.12113829535017172, "learning_rate": 1.8330291565806734e-06, "loss": 0.0, "num_tokens": 44673357.0, "reward": 0.8984375, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 522 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.5017921146953404, "grad_norm": 0.07934921760227831, "learning_rate": 1.832406129204295e-06, "loss": -0.0, "num_tokens": 44764356.0, "reward": 0.7578125, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 523 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022546389467422e-09, "advantages/std": 0.618557333946228, "advantages/var": 0.38261317537866546, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 1.5046594982078854, "grad_norm": 0.13005206333052863, "learning_rate": 1.8317820479164219e-06, "loss": -0.0, "num_tokens": 44856882.0, "reward": 0.765625, "reward_std": 0.17176413536071777, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 524 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987538125611118e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.5075268817204301, "grad_norm": 0.08623005065735916, "learning_rate": 1.8311569135072059e-06, "loss": 0.0, "num_tokens": 44937742.0, "reward": 0.6953125, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 525 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.75738695226396e-09, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.510394265232975, "grad_norm": 0.1094564783292251, "learning_rate": 1.8305307267681305e-06, "loss": 0.0, "num_tokens": 45017324.0, "reward": 0.8046875, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 526 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.4497179652165926e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.5132616487455197, "grad_norm": 0.1083477319515736, "learning_rate": 1.8299034884920128e-06, "loss": -0.0, "num_tokens": 45088340.0, "reward": 0.6953125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 527 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.453569069264198e-09, "advantages/std": 0.5227956175804138, "advantages/var": 0.2733152577612863, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.5161290322580645, "grad_norm": 0.10553993076350929, "learning_rate": 1.829275199473001e-06, "loss": 0.0, "num_tokens": 45174127.0, "reward": 0.7890625, "reward_std": 0.14230038225650787, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 528 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 1.0954471927793293e-08, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.5189964157706093, "grad_norm": 0.07736508690404215, "learning_rate": 1.8286458605065728e-06, "loss": 0.0, "num_tokens": 45258212.0, "reward": 0.6171875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 529 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907216333870301e-10, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.521863799283154, "grad_norm": 0.12359899648958964, "learning_rate": 1.828015472389536e-06, "loss": 0.0, "num_tokens": 45342083.0, "reward": 0.7734375, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 530 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.011196882346898e-09, "advantages/std": 0.618573009967804, "advantages/var": 0.3826325686606289, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.524731182795699, "grad_norm": 0.1403189014289581, "learning_rate": 1.827384035920027e-06, "loss": -0.0, "num_tokens": 45426887.0, "reward": 0.640625, "reward_std": 0.18885356187820435, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 531 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504949684853452e-09, "advantages/std": 0.5726854801177979, "advantages/var": 0.32796865913775264, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 1.5275985663082436, "grad_norm": 0.1282566540666401, "learning_rate": 1.8267515518975086e-06, "loss": -0.0, "num_tokens": 45516431.0, "reward": 0.640625, "reward_std": 0.15992169082164764, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 532 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.9752099207640785e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.5304659498207887, "grad_norm": 0.12702320164205028, "learning_rate": 1.8261180211227707e-06, "loss": -0.0, "num_tokens": 45580640.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 533 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721527138226332e-09, "advantages/std": 0.5227934122085571, "advantages/var": 0.27331295184866633, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.5333333333333332, "grad_norm": 0.18715919069910997, "learning_rate": 1.825483444397928e-06, "loss": -0.0, "num_tokens": 45663240.0, "reward": 0.65625, "reward_std": 0.1422954797744751, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 534 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2525831708228704e-09, "advantages/std": 0.5726664066314697, "advantages/var": 0.32794681328419983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.5362007168458782, "grad_norm": 0.16291162730230088, "learning_rate": 1.8248478225264199e-06, "loss": -0.0, "num_tokens": 45741546.0, "reward": 0.8203125, "reward_std": 0.13941730558872223, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 535 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.5390681003584228, "grad_norm": 0.13444607915853476, "learning_rate": 1.8242111563130088e-06, "loss": 0.0, "num_tokens": 45818648.0, "reward": 0.734375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 536 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.970911630250105e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.5419354838709678, "grad_norm": 0.08328740955067904, "learning_rate": 1.8235734465637792e-06, "loss": 0.0, "num_tokens": 45903903.0, "reward": 0.8125, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 537 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96694656101877e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.5448028673835126, "grad_norm": 0.11540333279554736, "learning_rate": 1.8229346940861373e-06, "loss": 0.0, "num_tokens": 45981684.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 538 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065607939970255e-09, "advantages/std": 0.5726834535598755, "advantages/var": 0.32796633798126607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.5476702508960574, "grad_norm": 0.1008218760985725, "learning_rate": 1.822294899688809e-06, "loss": -0.0, "num_tokens": 46068728.0, "reward": 0.7734375, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 539 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878579738631987e-09, "advantages/std": 0.5727010369300842, "advantages/var": 0.3279864777007937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.5505376344086022, "grad_norm": 0.11497320202059863, "learning_rate": 1.8216540641818399e-06, "loss": -0.0, "num_tokens": 46162825.0, "reward": 0.609375, "reward_std": 0.17806705832481384, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 540 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.3550341577173055e-08, "advantages/std": 0.6185750365257263, "advantages/var": 0.38263507581280365, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.553405017921147, "grad_norm": 0.1890697733491184, "learning_rate": 1.821012188376593e-06, "loss": 0.0, "num_tokens": 46255325.0, "reward": 0.6796875, "reward_std": 0.1922685205936432, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 541 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.5562724014336917, "grad_norm": 0.09357868073770031, "learning_rate": 1.8203692730857492e-06, "loss": 0.0, "num_tokens": 46332842.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 542 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907505770133387e-10, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 1.5591397849462365, "grad_norm": 0.10648952831676473, "learning_rate": 1.819725319123305e-06, "loss": 0.0, "num_tokens": 46418299.0, "reward": 0.6953125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 543 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.5620071684587815, "grad_norm": 0.13346921631545694, "learning_rate": 1.8190803273045723e-06, "loss": 0.0, "num_tokens": 46495675.0, "reward": 0.8359375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 544 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.564874551971326, "grad_norm": 0.13632501005664457, "learning_rate": 1.8184342984461764e-06, "loss": 0.0, "num_tokens": 46580431.0, "reward": 0.703125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 545 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983450684521008e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.567741935483871, "grad_norm": 0.16067957718336778, "learning_rate": 1.8177872333660569e-06, "loss": 0.0, "num_tokens": 46665618.0, "reward": 0.828125, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 546 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.4082993428404723e-08, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.5706093189964156, "grad_norm": 0.08184278908307872, "learning_rate": 1.8171391328834638e-06, "loss": 0.0, "num_tokens": 46744346.0, "reward": 0.703125, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 547 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.5734767025089607, "grad_norm": 0.21545240402425084, "learning_rate": 1.8164899978189592e-06, "loss": -0.0, "num_tokens": 46836186.0, "reward": 0.6640625, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 548 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899641578136434e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 1.5763440860215052, "grad_norm": 0.08796547171831597, "learning_rate": 1.8158398289944142e-06, "loss": -0.0, "num_tokens": 46926622.0, "reward": 0.6640625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 549 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.5792114695340502, "grad_norm": 0.0, "learning_rate": 1.8151886272330094e-06, "loss": 0.0, "num_tokens": 47016908.0, "reward": 0.75, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 550 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.582078853046595, "grad_norm": 0.0932480653471435, "learning_rate": 1.8145363933592334e-06, "loss": 0.0, "num_tokens": 47112852.0, "reward": 0.75, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 551 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065538545096158e-09, "advantages/std": 0.5726932287216187, "advantages/var": 0.3279775342235922, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.5849462365591398, "grad_norm": 0.09557245277745192, "learning_rate": 1.8138831281988805e-06, "loss": 0.0, "num_tokens": 47204638.0, "reward": 0.8828125, "reward_std": 0.172288179397583, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 552 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 1.5878136200716846, "grad_norm": 0.14325604527216604, "learning_rate": 1.8132288325790515e-06, "loss": 0.0, "num_tokens": 47286370.0, "reward": 0.7890625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 553 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.9752099207640785e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.5906810035842294, "grad_norm": 0.1400010191680866, "learning_rate": 1.8125735073281522e-06, "loss": 0.0, "num_tokens": 47371538.0, "reward": 0.7109375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 554 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.25248601345888e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.5935483870967742, "grad_norm": 0.13880001238703485, "learning_rate": 1.8119171532758907e-06, "loss": -0.0, "num_tokens": 47461498.0, "reward": 0.8203125, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 555 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.596415770609319, "grad_norm": 0.06593143826340761, "learning_rate": 1.8112597712532796e-06, "loss": 0.0, "num_tokens": 47527028.0, "reward": 0.8671875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 556 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 4.929320121387901e-09, "advantages/std": 0.6612735390663147, "advantages/var": 0.43728269346928883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.599283154121864, "grad_norm": 0.24531721008997973, "learning_rate": 1.810601362092631e-06, "loss": 0.0, "num_tokens": 47606091.0, "reward": 0.7578125, "reward_std": 0.20858673751354218, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 557 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0689055673126508e-08, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.6021505376344085, "grad_norm": 0.14318215339362345, "learning_rate": 1.809941926627559e-06, "loss": 0.0, "num_tokens": 47678580.0, "reward": 0.828125, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 558 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.528039356157216e-09, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.6050179211469535, "grad_norm": 0.09542996872827302, "learning_rate": 1.8092814656929758e-06, "loss": 0.0, "num_tokens": 47763659.0, "reward": 0.796875, "reward_std": 0.18543371558189392, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 559 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125678014490734e-09, "advantages/std": 0.5227980017662048, "advantages/var": 0.2733177506507367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.607885304659498, "grad_norm": 0.10388799322074858, "learning_rate": 1.8086199801250934e-06, "loss": 0.0, "num_tokens": 47851559.0, "reward": 0.84375, "reward_std": 0.14571532607078552, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 560 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.610752688172043, "grad_norm": 0.19137629780953486, "learning_rate": 1.80795747076142e-06, "loss": 0.0, "num_tokens": 47927931.0, "reward": 0.7109375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 561 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.77521322356945e-09, "advantages/std": 0.6185711622238159, "advantages/var": 0.3826302827349224, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.6136200716845877, "grad_norm": 0.16267964778963664, "learning_rate": 1.8072939384407607e-06, "loss": -0.0, "num_tokens": 48011635.0, "reward": 0.8203125, "reward_std": 0.18884865939617157, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 562 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.01121313104812e-09, "advantages/std": 0.6185696721076965, "advantages/var": 0.3826284392514232, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.6164874551971327, "grad_norm": 0.14616920487924576, "learning_rate": 1.8066293840032146e-06, "loss": -0.0, "num_tokens": 48097916.0, "reward": 0.6953125, "reward_std": 0.18649455904960632, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 563 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.131336901697577e-10, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.6193548387096774, "grad_norm": 0.11525279452798919, "learning_rate": 1.8059638082901765e-06, "loss": 0.0, "num_tokens": 48193148.0, "reward": 0.8359375, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 564 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.6222222222222222, "grad_norm": 0.07025099475920485, "learning_rate": 1.8052972121443335e-06, "loss": 0.0, "num_tokens": 48266406.0, "reward": 0.78125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 565 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.3550515235317965e-08, "advantages/std": 0.6185671091079712, "advantages/var": 0.38262526847019274, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.625089605734767, "grad_norm": 0.14157810037874768, "learning_rate": 1.8046295964096641e-06, "loss": 0.0, "num_tokens": 48352229.0, "reward": 0.6953125, "reward_std": 0.18201877176761627, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 566 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 1.6279569892473118, "grad_norm": 0.1390230216267632, "learning_rate": 1.8039609619314389e-06, "loss": 0.0, "num_tokens": 48443657.0, "reward": 0.7890625, "reward_std": 0.17859894037246704, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 567 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3444321679928155e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.6308243727598566, "grad_norm": 0.1369742479768933, "learning_rate": 1.8032913095562172e-06, "loss": 0.0, "num_tokens": 48532110.0, "reward": 0.71875, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 568 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.816760194592258e-09, "advantages/std": 0.6612721681594849, "advantages/var": 0.437280880382346, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.6336917562724014, "grad_norm": 0.16966932036270904, "learning_rate": 1.802620640131848e-06, "loss": -0.0, "num_tokens": 48620955.0, "reward": 0.7265625, "reward_std": 0.20623260736465454, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 569 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.269610284998461e-09, "advantages/std": 0.6185711622238159, "advantages/var": 0.3826302827349224, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.6365591397849464, "grad_norm": 0.1201180445347826, "learning_rate": 1.8019489545074672e-06, "loss": 0.0, "num_tokens": 48712311.0, "reward": 0.7265625, "reward_std": 0.18884865939617157, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 570 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131246346616979e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.639426523297491, "grad_norm": 0.15013529882265428, "learning_rate": 1.8012762535334975e-06, "loss": 0.0, "num_tokens": 48811971.0, "reward": 0.671875, "reward_std": 0.15650182962417603, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 571 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125942055767658e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.642293906810036, "grad_norm": 0.11477054989531618, "learning_rate": 1.8006025380616478e-06, "loss": 0.0, "num_tokens": 48896368.0, "reward": 0.9140625, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 572 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.3941923669503344e-08, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.6451612903225805, "grad_norm": 0.11789349960518178, "learning_rate": 1.7999278089449108e-06, "loss": 0.0, "num_tokens": 48983655.0, "reward": 0.71875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 573 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675973355770111, "advantages/var": 0.21864726823871994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.6480286738351255, "grad_norm": 0.0976815205011024, "learning_rate": 1.7992520670375625e-06, "loss": -0.0, "num_tokens": 49071387.0, "reward": 0.8125, "reward_std": 0.11100948601961136, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 574 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516894993554525e-09, "advantages/std": 0.6185593605041504, "advantages/var": 0.3826156824673035, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.65089605734767, "grad_norm": 0.17323239068739085, "learning_rate": 1.7985753131951614e-06, "loss": 0.0, "num_tokens": 49147131.0, "reward": 0.7890625, "reward_std": 0.17517907917499542, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 575 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.60983260462163e-08, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.653763440860215, "grad_norm": 0.0841651778614561, "learning_rate": 1.7978975482745477e-06, "loss": 0.0, "num_tokens": 49227436.0, "reward": 0.75, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 576 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252522235212141e-09, "advantages/std": 0.5726771354675293, "advantages/var": 0.3279591014872949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.6566308243727599, "grad_norm": 0.1548230898091453, "learning_rate": 1.7972187731338409e-06, "loss": 0.0, "num_tokens": 49307675.0, "reward": 0.78125, "reward_std": 0.1530819982290268, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 577 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.970961834751672e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.6594982078853047, "grad_norm": 0.11624446921524587, "learning_rate": 1.7965389886324397e-06, "loss": 0.0, "num_tokens": 49392352.0, "reward": 0.734375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 578 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.6623655913978495, "grad_norm": 0.05701063211512371, "learning_rate": 1.7958581956310214e-06, "loss": -0.0, "num_tokens": 49475549.0, "reward": 0.6328125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 579 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "epoch": 1.6652329749103942, "grad_norm": 0.09897261256162439, "learning_rate": 1.7951763949915398e-06, "loss": -0.0, "num_tokens": 49566450.0, "reward": 0.609375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 580 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.267469408556425e-08, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.668100358422939, "grad_norm": 0.056827008086541814, "learning_rate": 1.7944935875772242e-06, "loss": -0.0, "num_tokens": 49644577.0, "reward": 0.734375, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 581 }, { "advantages/mean": 7.450580596923828e-09, "advantages/snr": 1.062247430213908e-08, "advantages/std": 0.7013978362083435, "advantages/var": 0.49195892463774626, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 1.6709677419354838, "grad_norm": 0.21157890645747224, "learning_rate": 1.7938097742525788e-06, "loss": -0.0, "num_tokens": 49741380.0, "reward": 0.53125, "reward_std": 0.25118327140808105, "rewards/drgrpo_math_reward/mean": 0.53125, "rewards/drgrpo_math_reward/std": 0.5009832978248596, "step": 582 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.6738351254480288, "grad_norm": 0.113043689959348, "learning_rate": 1.7931249558833813e-06, "loss": 0.0, "num_tokens": 49824375.0, "reward": 0.6875, "reward_std": 0.13098980486392975, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 583 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.6767025089605734, "grad_norm": 0.08622293290280962, "learning_rate": 1.792439133336682e-06, "loss": -0.0, "num_tokens": 49894013.0, "reward": 0.84375, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 584 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721649001610904e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.6795698924731184, "grad_norm": 0.09241404849302579, "learning_rate": 1.7917523074808022e-06, "loss": -0.0, "num_tokens": 49980856.0, "reward": 0.7890625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 585 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.682437275985663, "grad_norm": 0.08753320201736134, "learning_rate": 1.7910644791853345e-06, "loss": 0.0, "num_tokens": 50052352.0, "reward": 0.8203125, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 586 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.50555013211365e-09, "advantages/std": 0.6185928583145142, "advantages/var": 0.3826571243577206, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.685304659498208, "grad_norm": 0.12004129638171342, "learning_rate": 1.79037564932114e-06, "loss": -0.0, "num_tokens": 50148699.0, "reward": 0.734375, "reward_std": 0.2161829173564911, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 587 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.6881720430107527, "grad_norm": 0.07962124500314915, "learning_rate": 1.7896858187603474e-06, "loss": -0.0, "num_tokens": 50214726.0, "reward": 0.859375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 588 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379887186086637e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.6910394265232975, "grad_norm": 0.0806365501054856, "learning_rate": 1.788994988376353e-06, "loss": 0.0, "num_tokens": 50299840.0, "reward": 0.8828125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 589 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.6939068100358423, "grad_norm": 0.07540264432819178, "learning_rate": 1.7883031590438194e-06, "loss": -0.0, "num_tokens": 50383747.0, "reward": 0.6328125, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 590 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5056078712314432e-09, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.696774193548387, "grad_norm": 0.1138498837367993, "learning_rate": 1.7876103316386727e-06, "loss": 0.0, "num_tokens": 50469109.0, "reward": 0.6875, "reward_std": 0.18543371558189392, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 591 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 8.450146530707848e-09, "advantages/std": 0.6612826585769653, "advantages/var": 0.4372947545346193, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.6996415770609319, "grad_norm": 0.13635243870664232, "learning_rate": 1.7869165070381043e-06, "loss": 0.0, "num_tokens": 50552794.0, "reward": 0.8125, "reward_std": 0.21884137392044067, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 592 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907227504745508e-10, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.7025089605734767, "grad_norm": 0.10518691727099587, "learning_rate": 1.7862216861205667e-06, "loss": -0.0, "num_tokens": 50627662.0, "reward": 0.8125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 593 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299802498719973e-09, "advantages/std": 0.4049576222896576, "advantages/var": 0.16399067585049298, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.7053763440860215, "grad_norm": 0.08787262259127059, "learning_rate": 1.7855258697657746e-06, "loss": 0.0, "num_tokens": 50712403.0, "reward": 0.8046875, "reward_std": 0.08891239762306213, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 594 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.7082437275985662, "grad_norm": 0.06658110423735736, "learning_rate": 1.7848290588547026e-06, "loss": 0.0, "num_tokens": 50800496.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 595 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.7111111111111112, "grad_norm": 0.036404286414256665, "learning_rate": 1.7841312542695848e-06, "loss": 0.0, "num_tokens": 50882390.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 596 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 1.0954489382432772e-08, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.7139784946236558, "grad_norm": 0.09944774588564752, "learning_rate": 1.7834324568939136e-06, "loss": -0.0, "num_tokens": 50957746.0, "reward": 0.796875, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 597 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 1.7168458781362008, "grad_norm": 0.04898368888650664, "learning_rate": 1.782732667612438e-06, "loss": -0.0, "num_tokens": 51036336.0, "reward": 0.75, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 598 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979230209351863e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.7197132616487454, "grad_norm": 0.07891434988260301, "learning_rate": 1.7820318873111626e-06, "loss": -0.0, "num_tokens": 51115891.0, "reward": 0.7890625, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 599 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.24699507424554e-08, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.7225806451612904, "grad_norm": 0.10373118203585466, "learning_rate": 1.7813301168773478e-06, "loss": -0.0, "num_tokens": 51195517.0, "reward": 0.7421875, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 600 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983629174425397e-09, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.7254480286738352, "grad_norm": 0.08823420239986755, "learning_rate": 1.7806273571995065e-06, "loss": -0.0, "num_tokens": 51274101.0, "reward": 0.796875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 601 }, { "advantages/mean": -9.313225746154785e-09, "advantages/snr": 1.6262067861863043e-08, "advantages/std": 0.5726962685585022, "advantages/var": 0.3279810160208321, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.72831541218638, "grad_norm": 0.11082640603899152, "learning_rate": 1.7799236091674045e-06, "loss": -0.0, "num_tokens": 51361658.0, "reward": 0.7890625, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 602 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 1.7311827956989247, "grad_norm": 0.028673913004331607, "learning_rate": 1.779218873672059e-06, "loss": -0.0, "num_tokens": 51443992.0, "reward": 0.734375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 603 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049402934764073e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.7340501792114695, "grad_norm": 0.08853857547337998, "learning_rate": 1.7785131516057374e-06, "loss": -0.0, "num_tokens": 51514409.0, "reward": 0.84375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 604 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.7369175627240143, "grad_norm": 0.08212926446077826, "learning_rate": 1.7778064438619559e-06, "loss": -0.0, "num_tokens": 51606124.0, "reward": 0.71875, "reward_std": 0.11678344756364822, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 605 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.739784946236559, "grad_norm": 0.08105299644208669, "learning_rate": 1.7770987513354796e-06, "loss": 0.0, "num_tokens": 51690944.0, "reward": 0.765625, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 606 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.065668450848788e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.742652329749104, "grad_norm": 0.09925762413655309, "learning_rate": 1.7763900749223194e-06, "loss": -0.0, "num_tokens": 51786435.0, "reward": 0.6484375, "reward_std": 0.14966703951358795, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 607 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.13111686481873e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.7455197132616487, "grad_norm": 0.1563651934669506, "learning_rate": 1.7756804155197322e-06, "loss": 0.0, "num_tokens": 51865124.0, "reward": 0.7265625, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 608 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 7.966936970044213e-09, "advantages/std": 0.7013906836509705, "advantages/var": 0.4919488911123757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 1.7483870967741937, "grad_norm": 0.12125368324296161, "learning_rate": 1.7749697740262195e-06, "loss": 0.0, "num_tokens": 51958950.0, "reward": 0.703125, "reward_std": 0.23752352595329285, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 609 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.96693437436781e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.7512544802867382, "grad_norm": 0.08696671817760408, "learning_rate": 1.7742581513415265e-06, "loss": -0.0, "num_tokens": 52035790.0, "reward": 0.734375, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 610 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.011171929325739e-09, "advantages/std": 0.6185781359672546, "advantages/var": 0.38263891029672337, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.7541218637992833, "grad_norm": 0.14584663912796295, "learning_rate": 1.7735455483666404e-06, "loss": -0.0, "num_tokens": 52129039.0, "reward": 0.6875, "reward_std": 0.19780512154102325, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 611 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.264868475059852e-08, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.7569892473118278, "grad_norm": 0.08213421867548132, "learning_rate": 1.7728319660037897e-06, "loss": 0.0, "num_tokens": 52202394.0, "reward": 0.875, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 612 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.7598566308243728, "grad_norm": 0.11010447507234923, "learning_rate": 1.7721174051564426e-06, "loss": 0.0, "num_tokens": 52280139.0, "reward": 0.8828125, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 613 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 7.31820321152782e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.7627240143369176, "grad_norm": 0.0884489361838935, "learning_rate": 1.771401866729307e-06, "loss": -0.0, "num_tokens": 52362540.0, "reward": 0.7421875, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 614 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.2675148962721033e-08, "advantages/std": 0.6612863540649414, "advantages/var": 0.43729964207250305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.7655913978494624, "grad_norm": 0.1390975581270697, "learning_rate": 1.7706853516283269e-06, "loss": 0.0, "num_tokens": 52450800.0, "reward": 0.78125, "reward_std": 0.2222612351179123, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 615 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.294576256939553e-08, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.7684587813620072, "grad_norm": 0.08340422526536204, "learning_rate": 1.7699678607606848e-06, "loss": -0.0, "num_tokens": 52539594.0, "reward": 0.6640625, "reward_std": 0.1236182302236557, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 616 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958206547585018e-10, "advantages/std": 0.46761560440063477, "advantages/var": 0.21866435347897095, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.771326164874552, "grad_norm": 0.08804980976750112, "learning_rate": 1.7692493950347968e-06, "loss": 0.0, "num_tokens": 52619699.0, "reward": 0.8046875, "reward_std": 0.12809400260448456, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 617 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970961834751672e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.7741935483870968, "grad_norm": 0.09704645442338684, "learning_rate": 1.768529955360315e-06, "loss": 0.0, "num_tokens": 52693760.0, "reward": 0.875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 618 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125814501076877e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 1.7770609318996415, "grad_norm": 0.09728208620780422, "learning_rate": 1.7678095426481235e-06, "loss": -0.0, "num_tokens": 52785410.0, "reward": 0.6640625, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 619 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9916745663410504e-09, "advantages/std": 0.4676077961921692, "advantages/var": 0.21865705105969724, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.7799283154121865, "grad_norm": 0.0590535191981089, "learning_rate": 1.7670881578103383e-06, "loss": 0.0, "num_tokens": 52870139.0, "reward": 0.765625, "reward_std": 0.12125921994447708, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 620 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.626265687704098e-09, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.782795698924731, "grad_norm": 0.11417855950003987, "learning_rate": 1.7663658017603072e-06, "loss": 0.0, "num_tokens": 52966074.0, "reward": 0.71875, "reward_std": 0.15072788298130035, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 621 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344336502847305e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 1.7856630824372761, "grad_norm": 0.10528707358064278, "learning_rate": 1.7656424754126064e-06, "loss": -0.0, "num_tokens": 53053990.0, "reward": 0.78125, "reward_std": 0.13781969249248505, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 622 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125731633597449e-09, "advantages/std": 0.5227940678596497, "advantages/var": 0.27331363738923997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.7885304659498207, "grad_norm": 0.16258954346364168, "learning_rate": 1.7649181796830415e-06, "loss": 0.0, "num_tokens": 53144189.0, "reward": 0.6484375, "reward_std": 0.1433562934398651, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 623 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.2583643671009597e-09, "advantages/std": 0.6185821294784546, "advantages/var": 0.38264385091009956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.7913978494623657, "grad_norm": 0.1303926158518905, "learning_rate": 1.7641929154886453e-06, "loss": 0.0, "num_tokens": 53228553.0, "reward": 0.75, "reward_std": 0.20463499426841736, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 624 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958429318795977e-10, "advantages/std": 0.4676051437854767, "advantages/var": 0.21865457049463632, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.7942652329749103, "grad_norm": 0.1150785865190137, "learning_rate": 1.7634666837476763e-06, "loss": -0.0, "num_tokens": 53314514.0, "reward": 0.7578125, "reward_std": 0.11784426867961884, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 625 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.7971326164874553, "grad_norm": 0.08192531188502497, "learning_rate": 1.7627394853796184e-06, "loss": 0.0, "num_tokens": 53394009.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 626 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.0111745406690435e-09, "advantages/std": 0.6185775995254517, "advantages/var": 0.38263824663467005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 1.8, "grad_norm": 0.1163617284955123, "learning_rate": 1.7620113213051795e-06, "loss": -0.0, "num_tokens": 53494339.0, "reward": 0.6328125, "reward_std": 0.19674427807331085, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 627 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 9.786294625565132e-09, "advantages/std": 0.6185790300369263, "advantages/var": 0.38264001640142453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.8028673835125448, "grad_norm": 0.12523098720127152, "learning_rate": 1.76128219244629e-06, "loss": -0.0, "num_tokens": 53580382.0, "reward": 0.7421875, "reward_std": 0.1990984082221985, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 628 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3443230978138685e-09, "advantages/std": 0.5227916836738586, "advantages/var": 0.2733111445185479, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.8057347670250896, "grad_norm": 0.10023370396243167, "learning_rate": 1.7605520997261011e-06, "loss": 0.0, "num_tokens": 53668316.0, "reward": 0.71875, "reward_std": 0.13994136452674866, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 629 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.974957819673592e-09, "advantages/std": 0.46761295199394226, "advantages/var": 0.21866187287248895, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.8086021505376344, "grad_norm": 0.2227891417965325, "learning_rate": 1.7598210440689857e-06, "loss": -0.0, "num_tokens": 53738437.0, "reward": 0.84375, "reward_std": 0.1246790662407875, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 630 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.126037115417672e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.8114695340501792, "grad_norm": 0.17391472702432173, "learning_rate": 1.7590890264005354e-06, "loss": 0.0, "num_tokens": 53829447.0, "reward": 0.640625, "reward_std": 0.1173202320933342, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 631 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262736431211962e-09, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.814336917562724, "grad_norm": 0.172820094067218, "learning_rate": 1.7583560476475587e-06, "loss": 0.0, "num_tokens": 53901473.0, "reward": 0.75, "reward_std": 0.1462521106004715, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 632 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.817204301075269, "grad_norm": 0.050725545773023864, "learning_rate": 1.757622108738083e-06, "loss": -0.0, "num_tokens": 53966130.0, "reward": 0.9609375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 633 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.8200716845878135, "grad_norm": 0.09921205484552398, "learning_rate": 1.756887210601349e-06, "loss": -0.0, "num_tokens": 54045729.0, "reward": 0.8203125, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 634 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814193005215579e-09, "advantages/std": 0.5227980613708496, "advantages/var": 0.27331781297311863, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.8229390681003586, "grad_norm": 0.11264600133960613, "learning_rate": 1.7561513541678141e-06, "loss": -0.0, "num_tokens": 54130690.0, "reward": 0.734375, "reward_std": 0.14571532607078552, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 635 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.131160024959952e-09, "advantages/std": 0.5726873874664307, "advantages/var": 0.3279708437631257, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.8258064516129031, "grad_norm": 0.12736837348410912, "learning_rate": 1.7554145403691473e-06, "loss": 0.0, "num_tokens": 54217523.0, "reward": 0.7734375, "reward_std": 0.16651421785354614, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 636 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.247008722823236e-08, "advantages/std": 0.5227916836738586, "advantages/var": 0.2733111445185479, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.8286738351254481, "grad_norm": 0.12924698284111052, "learning_rate": 1.7546767701382308e-06, "loss": -0.0, "num_tokens": 54301592.0, "reward": 0.671875, "reward_std": 0.13994136452674866, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 637 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344354173221399e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 1.8315412186379927, "grad_norm": 0.09778556658427673, "learning_rate": 1.7539380444091571e-06, "loss": -0.0, "num_tokens": 54391954.0, "reward": 0.75, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 638 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5629027821475993e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.8344086021505377, "grad_norm": 0.11683676138661579, "learning_rate": 1.753198364117229e-06, "loss": 0.0, "num_tokens": 54481350.0, "reward": 0.828125, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 639 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.8372759856630825, "grad_norm": 0.07753623795481836, "learning_rate": 1.7524577301989572e-06, "loss": -0.0, "num_tokens": 54552292.0, "reward": 0.796875, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 640 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0689006924160064e-08, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.8401433691756273, "grad_norm": 0.11752012585465188, "learning_rate": 1.7517161435920605e-06, "loss": 0.0, "num_tokens": 54632333.0, "reward": 0.9453125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 641 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016541313711486e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.843010752688172, "grad_norm": 0.09296534465920685, "learning_rate": 1.7509736052354632e-06, "loss": -0.0, "num_tokens": 54715302.0, "reward": 0.8828125, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 642 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.8458781362007168, "grad_norm": 0.11737325425319424, "learning_rate": 1.750230116069295e-06, "loss": 0.0, "num_tokens": 54796941.0, "reward": 0.6796875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 643 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.439391676409106e-09, "advantages/std": 0.5726771354675293, "advantages/var": 0.3279591014872949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.8487455197132616, "grad_norm": 0.10768871696196604, "learning_rate": 1.7494856770348903e-06, "loss": -0.0, "num_tokens": 54877517.0, "reward": 0.734375, "reward_std": 0.1530819982290268, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 644 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3444254652277355e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.8516129032258064, "grad_norm": 0.13384002189732117, "learning_rate": 1.748740289074784e-06, "loss": -0.0, "num_tokens": 54947818.0, "reward": 0.8828125, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 645 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 6.639245037827722e-10, "advantages/std": 0.7013768553733826, "advantages/var": 0.4919294932534548, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.8544802867383514, "grad_norm": 0.18004331924682504, "learning_rate": 1.7479939531327144e-06, "loss": 0.0, "num_tokens": 55039411.0, "reward": 0.625, "reward_std": 0.22043409943580627, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 646 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 9.797937967257332e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.857347670250896, "grad_norm": 0.09604307611587987, "learning_rate": 1.747246670153619e-06, "loss": 0.0, "num_tokens": 55129354.0, "reward": 0.8515625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 647 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.860215053763441, "grad_norm": 0.08548359825937912, "learning_rate": 1.746498441083635e-06, "loss": 0.0, "num_tokens": 55198994.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 648 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.77520147229019e-09, "advantages/std": 0.6185722351074219, "advantages/var": 0.3826316100457916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.8630824372759855, "grad_norm": 0.15433130304994333, "learning_rate": 1.7457492668700967e-06, "loss": -0.0, "num_tokens": 55276732.0, "reward": 0.8359375, "reward_std": 0.19097033143043518, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 649 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 1.0954289704678737e-08, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.8659498207885306, "grad_norm": 0.17622307002850085, "learning_rate": 1.7449991484615359e-06, "loss": -0.0, "num_tokens": 55356997.0, "reward": 0.875, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 650 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878766595896755e-09, "advantages/std": 0.5726791024208069, "advantages/var": 0.327961354349501, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 1.8688172043010751, "grad_norm": 0.10472142185864963, "learning_rate": 1.7442480868076789e-06, "loss": 0.0, "num_tokens": 55446748.0, "reward": 0.7421875, "reward_std": 0.15308690071105957, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 651 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.8716845878136201, "grad_norm": 0.07288459873608755, "learning_rate": 1.743496082859447e-06, "loss": -0.0, "num_tokens": 55525801.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 652 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.874551971326165, "grad_norm": 0.08759904484132258, "learning_rate": 1.7427431375689543e-06, "loss": 0.0, "num_tokens": 55598059.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 653 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.8774193548387097, "grad_norm": 0.07013332630729088, "learning_rate": 1.7419892518895067e-06, "loss": 0.0, "num_tokens": 55682674.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 654 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.8802867383512545, "grad_norm": 0.09070016569887124, "learning_rate": 1.7412344267756009e-06, "loss": 0.0, "num_tokens": 55766402.0, "reward": 0.71875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 655 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966756148857264e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.8831541218637993, "grad_norm": 0.12727931385893082, "learning_rate": 1.7404786631829226e-06, "loss": 0.0, "num_tokens": 55843660.0, "reward": 0.859375, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 656 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.8788021410185465e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.886021505376344, "grad_norm": 0.10677855855478481, "learning_rate": 1.7397219620683463e-06, "loss": -0.0, "num_tokens": 55919559.0, "reward": 0.7265625, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 657 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.8888888888888888, "grad_norm": 0.16188825238468785, "learning_rate": 1.738964324389933e-06, "loss": 0.0, "num_tokens": 55991862.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 658 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.8917562724014338, "grad_norm": 0.08240144468938987, "learning_rate": 1.7382057511069296e-06, "loss": 0.0, "num_tokens": 56059473.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 659 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721323019547286e-09, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.8946236559139784, "grad_norm": 0.10283567421926562, "learning_rate": 1.737446243179768e-06, "loss": 0.0, "num_tokens": 56131773.0, "reward": 0.8984375, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 660 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.8974910394265234, "grad_norm": 0.07697940771219978, "learning_rate": 1.7366858015700624e-06, "loss": 0.0, "num_tokens": 56216211.0, "reward": 0.640625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 661 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 3.983471193166227e-09, "advantages/std": 0.7013902068138123, "advantages/var": 0.49194822221432233, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.900358422939068, "grad_norm": 0.1300291889631906, "learning_rate": 1.7359244272406107e-06, "loss": 0.0, "num_tokens": 56321214.0, "reward": 0.5859375, "reward_std": 0.23646268248558044, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 662 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.903225806451613, "grad_norm": 0.05520265524593956, "learning_rate": 1.73516212115539e-06, "loss": -0.0, "num_tokens": 56401868.0, "reward": 0.8203125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 663 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.672108843924502e-09, "advantages/std": 0.5228019952774048, "advantages/var": 0.2733219262660356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.9060931899641576, "grad_norm": 0.13582013022578326, "learning_rate": 1.7343988842795584e-06, "loss": 0.0, "num_tokens": 56493224.0, "reward": 0.7421875, "reward_std": 0.14807432889938354, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 664 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970744583309513e-09, "advantages/std": 0.46761560440063477, "advantages/var": 0.21866435347897095, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.9089605734767026, "grad_norm": 0.13227371644183966, "learning_rate": 1.7336347175794521e-06, "loss": 0.0, "num_tokens": 56565927.0, "reward": 0.7109375, "reward_std": 0.12809400260448456, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 665 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.0539355639134917e-08, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.9118279569892473, "grad_norm": 0.12795985341564062, "learning_rate": 1.7328696220225845e-06, "loss": 0.0, "num_tokens": 56652668.0, "reward": 0.7421875, "reward_std": 0.17859894037246704, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 666 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.8993206715259576e-09, "advantages/std": 0.4049627184867859, "advantages/var": 0.1639948033642078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 1.9146953405017921, "grad_norm": 0.07328231464595761, "learning_rate": 1.732103598577645e-06, "loss": 0.0, "num_tokens": 56749515.0, "reward": 0.625, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 667 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.917562724014337, "grad_norm": 0.12914763253719083, "learning_rate": 1.7313366482144973e-06, "loss": -0.0, "num_tokens": 56833761.0, "reward": 0.7578125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 668 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.9204301075268817, "grad_norm": 0.07890408596078594, "learning_rate": 1.7305687719041798e-06, "loss": 0.0, "num_tokens": 56919886.0, "reward": 0.7578125, "reward_std": 0.08443662524223328, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 669 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.9232974910394265, "grad_norm": 0.09684022294367929, "learning_rate": 1.7297999706189025e-06, "loss": 0.0, "num_tokens": 56989434.0, "reward": 0.953125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 670 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.528014692854944e-09, "advantages/std": 0.6185711622238159, "advantages/var": 0.3826302827349224, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.9261648745519713, "grad_norm": 0.21154492643552217, "learning_rate": 1.7290302453320465e-06, "loss": 0.0, "num_tokens": 57073367.0, "reward": 0.7890625, "reward_std": 0.18884867429733276, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 671 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.528079978419127e-10, "advantages/std": 0.6185657978057861, "advantages/var": 0.3826236462151087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.9290322580645163, "grad_norm": 0.14458294656914567, "learning_rate": 1.7282595970181628e-06, "loss": 0.0, "num_tokens": 57148824.0, "reward": 0.7734375, "reward_std": 0.1830746978521347, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 672 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966756148857264e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.9318996415770608, "grad_norm": 0.1066687707439637, "learning_rate": 1.7274880266529715e-06, "loss": 0.0, "num_tokens": 57235947.0, "reward": 0.734375, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 673 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.9347670250896059, "grad_norm": 0.08970681835046851, "learning_rate": 1.7267155352133598e-06, "loss": 0.0, "num_tokens": 57314343.0, "reward": 0.8046875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 674 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.9376344086021504, "grad_norm": 0.037950397904259026, "learning_rate": 1.7259421236773806e-06, "loss": -0.0, "num_tokens": 57398252.0, "reward": 0.7421875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 675 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954734451444e-08, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 1.9405017921146954, "grad_norm": 0.07044606949811252, "learning_rate": 1.7251677930242524e-06, "loss": 0.0, "num_tokens": 57472101.0, "reward": 0.890625, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 676 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.1312150336472e-10, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.94336917562724, "grad_norm": 0.13746445766784376, "learning_rate": 1.7243925442343575e-06, "loss": 0.0, "num_tokens": 57574344.0, "reward": 0.7109375, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 677 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.344384639658041e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 1.946236559139785, "grad_norm": 0.11281681887095754, "learning_rate": 1.7236163782892402e-06, "loss": 0.0, "num_tokens": 57650886.0, "reward": 0.828125, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 678 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 7.52801541824388e-09, "advantages/std": 0.6185711026191711, "advantages/var": 0.38263020899549716, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 1.9491039426523298, "grad_norm": 0.11326927243511467, "learning_rate": 1.7228392961716058e-06, "loss": -0.0, "num_tokens": 57739335.0, "reward": 0.7109375, "reward_std": 0.18884865939617157, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 679 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5228016972541809, "advantages/var": 0.27332161465185223, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.9519713261648746, "grad_norm": 0.1003676774596821, "learning_rate": 1.72206129886532e-06, "loss": 0.0, "num_tokens": 57820745.0, "reward": 0.7109375, "reward_std": 0.15125194191932678, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 680 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125868122237681e-09, "advantages/std": 0.5227840542793274, "advantages/var": 0.27330316740873073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.9548387096774194, "grad_norm": 0.11503852642526637, "learning_rate": 1.7212823873554077e-06, "loss": 0.0, "num_tokens": 57900871.0, "reward": 0.75, "reward_std": 0.13204574584960938, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 681 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.9577060931899641, "grad_norm": 0.050713752984807875, "learning_rate": 1.72050256262805e-06, "loss": 0.0, "num_tokens": 57992278.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 682 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.960573476702509, "grad_norm": 0.04396861649262558, "learning_rate": 1.7197218256705857e-06, "loss": 0.0, "num_tokens": 58073213.0, "reward": 0.9140625, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 683 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.764055585405327e-09, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 1.9634408602150537, "grad_norm": 0.11596307369386666, "learning_rate": 1.7189401774715072e-06, "loss": 0.0, "num_tokens": 58166340.0, "reward": 0.7890625, "reward_std": 0.17859892547130585, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 684 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.9663082437275987, "grad_norm": 0.06345341305669575, "learning_rate": 1.7181576190204616e-06, "loss": -0.0, "num_tokens": 58246745.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 685 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.453574654603735e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.9691756272401433, "grad_norm": 0.15361994081883612, "learning_rate": 1.7173741513082478e-06, "loss": -0.0, "num_tokens": 58326136.0, "reward": 0.8125, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 686 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 1.9720430107526883, "grad_norm": 0.05207869475321715, "learning_rate": 1.7165897753268165e-06, "loss": -0.0, "num_tokens": 58407208.0, "reward": 0.8671875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 687 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.1266652673742488e-08, "advantages/std": 0.33064746856689453, "advantages/var": 0.10932774846969551, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.9749103942652328, "grad_norm": 0.11293308439040252, "learning_rate": 1.7158044920692682e-06, "loss": -0.0, "num_tokens": 58490331.0, "reward": 0.84375, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 688 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 1.9777777777777779, "grad_norm": 0.08567692143451083, "learning_rate": 1.7150183025298517e-06, "loss": 0.0, "num_tokens": 58568099.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 689 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5056078712314432e-09, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.9806451612903224, "grad_norm": 0.15087248324596125, "learning_rate": 1.7142312077039638e-06, "loss": -0.0, "num_tokens": 58644861.0, "reward": 0.75, "reward_std": 0.18543371558189392, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 690 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.198954094593892e-09, "advantages/std": 0.40496888756752014, "advantages/var": 0.16399979989767477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 1.9835125448028674, "grad_norm": 0.08803479172788438, "learning_rate": 1.7134432085881469e-06, "loss": 0.0, "num_tokens": 58728309.0, "reward": 0.859375, "reward_std": 0.09810129553079605, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 691 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.0348901800453944e-08, "advantages/std": 0.4049658179283142, "advantages/var": 0.16399731369034853, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 1.9863799283154122, "grad_norm": 0.09670088622860404, "learning_rate": 1.7126543061800893e-06, "loss": 0.0, "num_tokens": 58805589.0, "reward": 0.7578125, "reward_std": 0.0946863517165184, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 692 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954734451444e-08, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 1.989247311827957, "grad_norm": 0.09839509805314722, "learning_rate": 1.7118645014786216e-06, "loss": 0.0, "num_tokens": 58874408.0, "reward": 0.953125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 693 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 1.9921146953405018, "grad_norm": 0.0651367992382115, "learning_rate": 1.7110737954837182e-06, "loss": 0.0, "num_tokens": 58947583.0, "reward": 0.6953125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 694 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.168766962023389e-09, "advantages/std": 0.6612905859947205, "advantages/var": 0.4373052391252408, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 1.9949820788530466, "grad_norm": 0.13001653169016658, "learning_rate": 1.7102821891964933e-06, "loss": 0.0, "num_tokens": 59036599.0, "reward": 0.6953125, "reward_std": 0.2301519364118576, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 695 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 1.9978494623655914, "grad_norm": 0.13689238215628702, "learning_rate": 1.7094896836192021e-06, "loss": 0.0, "num_tokens": 59112092.0, "reward": 0.890625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 696 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.0387450601867105e-09, "advantages/std": 0.7393289804458618, "advantages/var": 0.5466073413271175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 2.002867383512545, "grad_norm": 0.14738650917438545, "learning_rate": 1.7086962797552372e-06, "loss": 0.0, "num_tokens": 59210297.0, "reward": 0.6796875, "reward_std": 0.2619746923446655, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 697 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.1579757501173404e-08, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.0057347670250896, "grad_norm": 0.10589172183649662, "learning_rate": 1.7079019786091304e-06, "loss": 0.0, "num_tokens": 59274800.0, "reward": 0.9453125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 698 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599659158374989e-09, "advantages/std": 0.40495285391807556, "advantages/var": 0.16398681389639425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.0086021505376346, "grad_norm": 0.10012153562307823, "learning_rate": 1.7071067811865474e-06, "loss": 0.0, "num_tokens": 59351944.0, "reward": 0.78125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 699 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7815330444009752e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.011469534050179, "grad_norm": 0.07468079317511114, "learning_rate": 1.7063106884942902e-06, "loss": 0.0, "num_tokens": 59431785.0, "reward": 0.8671875, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 700 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599795920412534e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 2.014336917562724, "grad_norm": 0.06906182952801246, "learning_rate": 1.7055137015402932e-06, "loss": -0.0, "num_tokens": 59517257.0, "reward": 0.796875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 701 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.0172043010752687, "grad_norm": 0.08192055329133872, "learning_rate": 1.7047158213336241e-06, "loss": 0.0, "num_tokens": 59593235.0, "reward": 0.8203125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 702 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.528008164360805e-09, "advantages/std": 0.6185716986656189, "advantages/var": 0.3826309463900692, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.0200716845878137, "grad_norm": 0.12550527758126853, "learning_rate": 1.703917048884481e-06, "loss": 0.0, "num_tokens": 59676199.0, "reward": 0.84375, "reward_std": 0.18990948796272278, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 703 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3442097670369564e-09, "advantages/std": 0.5228027701377869, "advantages/var": 0.2733227364637436, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.0229390681003583, "grad_norm": 0.13322333703758604, "learning_rate": 1.7031173852041914e-06, "loss": -0.0, "num_tokens": 59762217.0, "reward": 0.8125, "reward_std": 0.15254521369934082, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 704 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.0258064516129033, "grad_norm": 0.04565102881553923, "learning_rate": 1.7023168313052115e-06, "loss": 0.0, "num_tokens": 59857822.0, "reward": 0.6796875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 705 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.028673835125448, "grad_norm": 0.09427426790565954, "learning_rate": 1.701515388201125e-06, "loss": -0.0, "num_tokens": 59933853.0, "reward": 0.8515625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 706 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.031541218637993, "grad_norm": 0.15518696240852958, "learning_rate": 1.700713056906641e-06, "loss": -0.0, "num_tokens": 60010800.0, "reward": 0.8984375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 707 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299773557175319e-09, "advantages/std": 0.4049627184867859, "advantages/var": 0.1639948033642078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.0344086021505374, "grad_norm": 0.06849252314183808, "learning_rate": 1.6999098384375928e-06, "loss": -0.0, "num_tokens": 60094064.0, "reward": 0.90625, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 708 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.878651331707719e-09, "advantages/std": 0.5726926326751709, "advantages/var": 0.3279768515204182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.0372759856630824, "grad_norm": 0.1139504153736027, "learning_rate": 1.6991057338109374e-06, "loss": 0.0, "num_tokens": 60182913.0, "reward": 0.75, "reward_std": 0.1712273508310318, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 709 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.504900262044325e-09, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.0401433691756274, "grad_norm": 0.11041081468071696, "learning_rate": 1.6983007440447539e-06, "loss": 0.0, "num_tokens": 60277855.0, "reward": 0.703125, "reward_std": 0.16675157845020294, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 710 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917699002625455e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.043010752688172, "grad_norm": 0.11390562174606526, "learning_rate": 1.6974948701582417e-06, "loss": 0.0, "num_tokens": 60374112.0, "reward": 0.671875, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 711 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814432667740602e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.045878136200717, "grad_norm": 0.09113302680893781, "learning_rate": 1.6966881131717196e-06, "loss": 0.0, "num_tokens": 60458057.0, "reward": 0.8515625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 712 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599795920412534e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.0487455197132616, "grad_norm": 0.09052835603090768, "learning_rate": 1.6958804741066252e-06, "loss": 0.0, "num_tokens": 60540298.0, "reward": 0.765625, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 713 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.0516129032258066, "grad_norm": 0.11208448837819521, "learning_rate": 1.695071953985512e-06, "loss": 0.0, "num_tokens": 60616944.0, "reward": 0.7578125, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 714 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 1.09543064605741e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.054480286738351, "grad_norm": 0.11787645732740036, "learning_rate": 1.6942625538320492e-06, "loss": -0.0, "num_tokens": 60709383.0, "reward": 0.7265625, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 715 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983539800525091e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.057347670250896, "grad_norm": 0.08071633968502825, "learning_rate": 1.6934522746710204e-06, "loss": 0.0, "num_tokens": 60797237.0, "reward": 0.78125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 716 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.131003465846817e-09, "advantages/std": 0.5726984143257141, "advantages/var": 0.3279834737711873, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.0602150537634407, "grad_norm": 0.15739203678265482, "learning_rate": 1.6926411175283227e-06, "loss": -0.0, "num_tokens": 60890581.0, "reward": 0.75, "reward_std": 0.17700129747390747, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 717 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 7.3180942919464596e-09, "advantages/std": 0.5726834535598755, "advantages/var": 0.32796633798126607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.0630824372759857, "grad_norm": 0.14517907380286715, "learning_rate": 1.6918290834309631e-06, "loss": 0.0, "num_tokens": 60983682.0, "reward": 0.6640625, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 718 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299811976811062e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.0659498207885303, "grad_norm": 0.10471779058716572, "learning_rate": 1.691016173407061e-06, "loss": 0.0, "num_tokens": 61065503.0, "reward": 0.7421875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 719 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.0688172043010753, "grad_norm": 0.07649994559337656, "learning_rate": 1.6902023884858436e-06, "loss": 0.0, "num_tokens": 61149911.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 720 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.87871074032323e-09, "advantages/std": 0.5726856589317322, "advantages/var": 0.3279688639460723, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 2.07168458781362, "grad_norm": 0.1380323189643192, "learning_rate": 1.6893877296976457e-06, "loss": 0.0, "num_tokens": 61233905.0, "reward": 0.75, "reward_std": 0.16333171725273132, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 721 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344354173221399e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.074551971326165, "grad_norm": 0.11343862959636024, "learning_rate": 1.6885721980739089e-06, "loss": -0.0, "num_tokens": 61318503.0, "reward": 0.6875, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 722 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.7495735248314805e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.07741935483871, "grad_norm": 0.08660213837979328, "learning_rate": 1.6877557946471805e-06, "loss": -0.0, "num_tokens": 61399464.0, "reward": 0.90625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 723 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3444321679928155e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.0802867383512544, "grad_norm": 0.12595960129785244, "learning_rate": 1.68693852045111e-06, "loss": 0.0, "num_tokens": 61475702.0, "reward": 0.78125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 724 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.0831541218637994, "grad_norm": 0.14328105313741246, "learning_rate": 1.6861203765204508e-06, "loss": 0.0, "num_tokens": 61546127.0, "reward": 0.8046875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 725 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.086021505376344, "grad_norm": 0.0811799210414726, "learning_rate": 1.685301363891057e-06, "loss": 0.0, "num_tokens": 61636503.0, "reward": 0.7109375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 726 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344227436572988e-09, "advantages/std": 0.5228010416030884, "advantages/var": 0.27332092910127415, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 2.088888888888889, "grad_norm": 0.10775212757040224, "learning_rate": 1.6844814835998825e-06, "loss": 0.0, "num_tokens": 61730092.0, "reward": 0.703125, "reward_std": 0.15019109845161438, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 727 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9916557799070046e-09, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.0917562724014336, "grad_norm": 0.1057920850104567, "learning_rate": 1.6836607366849796e-06, "loss": -0.0, "num_tokens": 61817106.0, "reward": 0.8046875, "reward_std": 0.1236182376742363, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 728 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.0946236559139786, "grad_norm": 0.06764032154212377, "learning_rate": 1.6828391241854981e-06, "loss": 0.0, "num_tokens": 61894528.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 729 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6722160839964078e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 2.097491039426523, "grad_norm": 0.14158381613631227, "learning_rate": 1.6820166471416841e-06, "loss": -0.0, "num_tokens": 61984766.0, "reward": 0.640625, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 730 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.9792225930357895e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.100358422939068, "grad_norm": 0.13341958124051298, "learning_rate": 1.6811933065948773e-06, "loss": -0.0, "num_tokens": 62066304.0, "reward": 0.8125, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 731 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.1032258064516127, "grad_norm": 0.10447308299187771, "learning_rate": 1.6803691035875117e-06, "loss": 0.0, "num_tokens": 62144821.0, "reward": 0.796875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 732 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.1060931899641577, "grad_norm": 0.1208101716876214, "learning_rate": 1.6795440391631122e-06, "loss": -0.0, "num_tokens": 62220125.0, "reward": 0.9453125, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 733 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.131125327555407e-09, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.1089605734767023, "grad_norm": 0.11976838424289314, "learning_rate": 1.6787181143662953e-06, "loss": 0.0, "num_tokens": 62305754.0, "reward": 0.75, "reward_std": 0.16675157845020294, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 734 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.520874077163261e-09, "advantages/std": 0.661286473274231, "advantages/var": 0.4372997997354702, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "epoch": 2.1118279569892473, "grad_norm": 0.116480809225144, "learning_rate": 1.6778913302427666e-06, "loss": -0.0, "num_tokens": 62397252.0, "reward": 0.625, "reward_std": 0.22567126154899597, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 735 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262335283006049e-09, "advantages/std": 0.5726868510246277, "advantages/var": 0.3279702293365041, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "epoch": 2.1146953405017923, "grad_norm": 0.1915343604730967, "learning_rate": 1.6770636878393191e-06, "loss": -0.0, "num_tokens": 62491536.0, "reward": 0.640625, "reward_std": 0.16545340418815613, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 736 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 2.117562724014337, "grad_norm": 0.11750821923740856, "learning_rate": 1.676235188203834e-06, "loss": -0.0, "num_tokens": 62570395.0, "reward": 0.796875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 737 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.120430107526882, "grad_norm": 0.0702618672546427, "learning_rate": 1.6754058323852753e-06, "loss": -0.0, "num_tokens": 62650546.0, "reward": 0.8515625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 738 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.1232974910394264, "grad_norm": 0.08698472471690182, "learning_rate": 1.6745756214336934e-06, "loss": 0.0, "num_tokens": 62729753.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 739 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.394200364231044e-08, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.1261648745519715, "grad_norm": 0.12130414909496527, "learning_rate": 1.6737445564002203e-06, "loss": -0.0, "num_tokens": 62802801.0, "reward": 0.8828125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 740 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.129032258064516, "grad_norm": 0.09371527079704135, "learning_rate": 1.6729126383370696e-06, "loss": 0.0, "num_tokens": 62887489.0, "reward": 0.671875, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 741 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.504997077293582e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.131899641577061, "grad_norm": 0.09466306667377959, "learning_rate": 1.6720798682975348e-06, "loss": 0.0, "num_tokens": 62972014.0, "reward": 0.703125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 742 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.1347670250896056, "grad_norm": 0.028094443208474578, "learning_rate": 1.6712462473359876e-06, "loss": 0.0, "num_tokens": 63040350.0, "reward": 0.9140625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 743 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.1376344086021506, "grad_norm": 0.0827406576698606, "learning_rate": 1.6704117765078787e-06, "loss": 0.0, "num_tokens": 63118845.0, "reward": 0.734375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 744 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065623173308489e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.140501792114695, "grad_norm": 0.11513769535302304, "learning_rate": 1.6695764568697328e-06, "loss": 0.0, "num_tokens": 63207076.0, "reward": 0.8125, "reward_std": 0.15650182962417603, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 745 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.95862671130252e-10, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.14336917562724, "grad_norm": 0.14039636884747167, "learning_rate": 1.6687402894791506e-06, "loss": 0.0, "num_tokens": 63297581.0, "reward": 0.765625, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 746 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.146236559139785, "grad_norm": 0.10127409709172366, "learning_rate": 1.6679032753948055e-06, "loss": -0.0, "num_tokens": 63383169.0, "reward": 0.8125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 747 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.252531375408196e-09, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.1491039426523297, "grad_norm": 0.24271622070952423, "learning_rate": 1.6670654156764436e-06, "loss": 0.0, "num_tokens": 63466897.0, "reward": 0.78125, "reward_std": 0.15072786808013916, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 748 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516804898616483e-09, "advantages/std": 0.6185716986656189, "advantages/var": 0.3826309463900692, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.1519713261648747, "grad_norm": 0.09495054844683182, "learning_rate": 1.6662267113848806e-06, "loss": 0.0, "num_tokens": 63568628.0, "reward": 0.78125, "reward_std": 0.18990950286388397, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 749 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.516801416758646e-09, "advantages/std": 0.6185721755027771, "advantages/var": 0.3826315363062385, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.1548387096774193, "grad_norm": 0.11483586687460479, "learning_rate": 1.6653871635820026e-06, "loss": -0.0, "num_tokens": 63658107.0, "reward": 0.7734375, "reward_std": 0.19097033143043518, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 750 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975166506069069e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.1577060931899643, "grad_norm": 0.0819964801308947, "learning_rate": 1.6645467733307628e-06, "loss": 0.0, "num_tokens": 63741468.0, "reward": 0.8046875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 751 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907227504745508e-10, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.160573476702509, "grad_norm": 0.12539445836552948, "learning_rate": 1.6637055416951817e-06, "loss": -0.0, "num_tokens": 63818006.0, "reward": 0.84375, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 752 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975166506069069e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.163440860215054, "grad_norm": 0.10930395045054528, "learning_rate": 1.6628634697403444e-06, "loss": 0.0, "num_tokens": 63896332.0, "reward": 0.8671875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 753 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.1125794765316845e-09, "advantages/std": 0.6612692475318909, "advantages/var": 0.43727701773139316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.1663082437275984, "grad_norm": 0.14638189914691208, "learning_rate": 1.6620205585324013e-06, "loss": 0.0, "num_tokens": 63990639.0, "reward": 0.75, "reward_std": 0.20069602131843567, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 754 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 9.858529166555944e-09, "advantages/std": 0.6612809896469116, "advantages/var": 0.43729254726839883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.1691756272401435, "grad_norm": 0.15347012325541226, "learning_rate": 1.6611768091385629e-06, "loss": 0.0, "num_tokens": 64072599.0, "reward": 0.8203125, "reward_std": 0.2188364714384079, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 755 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954734451444e-08, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.172043010752688, "grad_norm": 0.10962415186479377, "learning_rate": 1.6603322226271038e-06, "loss": 0.0, "num_tokens": 64153799.0, "reward": 0.953125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 756 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516894993554525e-09, "advantages/std": 0.6185593605041504, "advantages/var": 0.3826156824673035, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.174910394265233, "grad_norm": 0.15211530736200046, "learning_rate": 1.659486800067356e-06, "loss": 0.0, "num_tokens": 64236188.0, "reward": 0.7890625, "reward_std": 0.17517907917499542, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 757 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.1777777777777776, "grad_norm": 0.12091419499987931, "learning_rate": 1.658640542529712e-06, "loss": 0.0, "num_tokens": 64314952.0, "reward": 0.859375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 758 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.09543064605741e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.1806451612903226, "grad_norm": 0.06596850187666084, "learning_rate": 1.65779345108562e-06, "loss": 0.0, "num_tokens": 64405738.0, "reward": 0.7421875, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 759 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749445740229558e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.183512544802867, "grad_norm": 0.1014149366216826, "learning_rate": 1.6569455268075853e-06, "loss": 0.0, "num_tokens": 64479877.0, "reward": 0.8359375, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 760 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.186379928315412, "grad_norm": 0.05411609763188232, "learning_rate": 1.656096770769166e-06, "loss": 0.0, "num_tokens": 64567374.0, "reward": 0.8203125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 761 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.189247311827957, "grad_norm": 0.057790587081550625, "learning_rate": 1.6552471840449752e-06, "loss": -0.0, "num_tokens": 64647877.0, "reward": 0.765625, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 762 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 9.797994938980706e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.1921146953405017, "grad_norm": 0.09229350004209458, "learning_rate": 1.6543967677106765e-06, "loss": 0.0, "num_tokens": 64725739.0, "reward": 0.8671875, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 763 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.1949820788530467, "grad_norm": 0.08741759453207316, "learning_rate": 1.6535455228429838e-06, "loss": -0.0, "num_tokens": 64804835.0, "reward": 0.78125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 764 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.4393350594456193e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.1978494623655913, "grad_norm": 0.10247069099104299, "learning_rate": 1.6526934505196605e-06, "loss": 0.0, "num_tokens": 64893347.0, "reward": 0.8515625, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 765 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0112830604957173e-09, "advantages/std": 0.6185553073883057, "advantages/var": 0.3826106682982413, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.2007168458781363, "grad_norm": 0.14126744166687255, "learning_rate": 1.6518405518195175e-06, "loss": -0.0, "num_tokens": 64982653.0, "reward": 0.7890625, "reward_std": 0.16834919154644012, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 766 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970845876775471e-09, "advantages/std": 0.46760880947113037, "advantages/var": 0.2186579986950079, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.203584229390681, "grad_norm": 0.08696000030980688, "learning_rate": 1.6509868278224124e-06, "loss": -0.0, "num_tokens": 65067351.0, "reward": 0.7890625, "reward_std": 0.11914245784282684, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 767 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.206451612903226, "grad_norm": 0.059580949505776214, "learning_rate": 1.6501322796092468e-06, "loss": 0.0, "num_tokens": 65137747.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 768 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.975157366213667e-09, "advantages/std": 0.4675973355770111, "advantages/var": 0.21864726823871994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.2093189964157705, "grad_norm": 0.12920441443676584, "learning_rate": 1.6492769082619669e-06, "loss": 0.0, "num_tokens": 65222576.0, "reward": 0.796875, "reward_std": 0.11100947856903076, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 769 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.053907636466336e-08, "advantages/std": 0.618579626083374, "advantages/var": 0.3826407538054468, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.2121863799283155, "grad_norm": 0.17369470199564782, "learning_rate": 1.64842071486356e-06, "loss": 0.0, "num_tokens": 65309214.0, "reward": 0.765625, "reward_std": 0.2001592218875885, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 770 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344456541825744e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 2.21505376344086, "grad_norm": 0.12310396305131238, "learning_rate": 1.6475637004980545e-06, "loss": -0.0, "num_tokens": 65383541.0, "reward": 0.8515625, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 771 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6262555319655851e-09, "advantages/std": 0.5726791024208069, "advantages/var": 0.327961354349501, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.217921146953405, "grad_norm": 0.13788401845903062, "learning_rate": 1.6467058662505193e-06, "loss": -0.0, "num_tokens": 65457069.0, "reward": 0.8203125, "reward_std": 0.15308690071105957, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 772 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049402934764073e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.22078853046595, "grad_norm": 0.08234956359655557, "learning_rate": 1.6458472132070598e-06, "loss": 0.0, "num_tokens": 65542338.0, "reward": 0.71875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 773 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5056509608456243e-09, "advantages/std": 0.6185514330863953, "advantages/var": 0.3826058753732333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.2236559139784946, "grad_norm": 0.18533237301540315, "learning_rate": 1.6449877424548191e-06, "loss": -0.0, "num_tokens": 65627174.0, "reward": 0.7578125, "reward_std": 0.1649293303489685, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 774 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.2265232974910396, "grad_norm": 0.14678834834025797, "learning_rate": 1.6441274550819752e-06, "loss": 0.0, "num_tokens": 65718040.0, "reward": 0.7265625, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 775 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876548503938182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.229390681003584, "grad_norm": 0.13079457903392905, "learning_rate": 1.6432663521777398e-06, "loss": 0.0, "num_tokens": 65790063.0, "reward": 0.890625, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 776 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.232258064516129, "grad_norm": 0.07412815956067396, "learning_rate": 1.642404434832358e-06, "loss": 0.0, "num_tokens": 65863602.0, "reward": 0.8046875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 777 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125942055767658e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.2351254480286737, "grad_norm": 0.07015696451372215, "learning_rate": 1.6415417041371052e-06, "loss": 0.0, "num_tokens": 65949550.0, "reward": 0.8984375, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 778 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125909557323754e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.2379928315412188, "grad_norm": 0.08181718966797533, "learning_rate": 1.640678161184287e-06, "loss": 0.0, "num_tokens": 66024603.0, "reward": 0.9375, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 779 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.4536081669351505e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.2408602150537633, "grad_norm": 0.12480215557562097, "learning_rate": 1.6398138070672372e-06, "loss": -0.0, "num_tokens": 66091063.0, "reward": 0.8515625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 780 }, { "advantages/mean": 9.313225746154785e-09, "advantages/snr": 1.9916359783736918e-08, "advantages/std": 0.46761685609817505, "advantages/var": 0.21866552410714135, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.2437275985663083, "grad_norm": 0.0789649925885418, "learning_rate": 1.638948642880317e-06, "loss": -0.0, "num_tokens": 66163966.0, "reward": 0.84375, "reward_std": 0.1293872892856598, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 781 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.246594982078853, "grad_norm": 0.11120191454022281, "learning_rate": 1.6380826697189126e-06, "loss": 0.0, "num_tokens": 66236604.0, "reward": 0.859375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 782 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958460418703728e-10, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.249462365591398, "grad_norm": 0.10790084580548243, "learning_rate": 1.6372158886794348e-06, "loss": 0.0, "num_tokens": 66311793.0, "reward": 0.9140625, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 783 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691850523553041e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.252329749103943, "grad_norm": 0.11999164653187376, "learning_rate": 1.6363483008593175e-06, "loss": 0.0, "num_tokens": 66394471.0, "reward": 0.7578125, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 784 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.2551971326164875, "grad_norm": 0.054776655410849015, "learning_rate": 1.6354799073570158e-06, "loss": -0.0, "num_tokens": 66471747.0, "reward": 0.7109375, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 785 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.258064516129032, "grad_norm": 0.06885862272421579, "learning_rate": 1.6346107092720047e-06, "loss": 0.0, "num_tokens": 66543060.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 786 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917811987622486e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.260931899641577, "grad_norm": 0.09426756429735036, "learning_rate": 1.6337407077047783e-06, "loss": -0.0, "num_tokens": 66619983.0, "reward": 0.8046875, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 787 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450174584180815e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.263799283154122, "grad_norm": 0.07220149653500607, "learning_rate": 1.6328699037568477e-06, "loss": 0.0, "num_tokens": 66703084.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 788 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049341918838717e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.2666666666666666, "grad_norm": 0.07166662293294265, "learning_rate": 1.6319982985307398e-06, "loss": -0.0, "num_tokens": 66781970.0, "reward": 0.7109375, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 789 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599467908273411e-09, "advantages/std": 0.4049696922302246, "advantages/var": 0.16400045162504284, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 2.2695340501792116, "grad_norm": 0.0901087122796034, "learning_rate": 1.6311258931299962e-06, "loss": 0.0, "num_tokens": 66871490.0, "reward": 0.7109375, "reward_std": 0.09916213154792786, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 790 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.294568831333326e-08, "advantages/std": 0.46761488914489746, "advantages/var": 0.21866368454999474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.272401433691756, "grad_norm": 0.168916034453994, "learning_rate": 1.6302526886591718e-06, "loss": 0.0, "num_tokens": 66953245.0, "reward": 0.6875, "reward_std": 0.12703317403793335, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 791 }, { "advantages/mean": -9.778887033462524e-09, "advantages/snr": 1.580873184003523e-08, "advantages/std": 0.6185750365257263, "advantages/var": 0.38263507581280365, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.275268817204301, "grad_norm": 0.1588000831481918, "learning_rate": 1.6293786862238331e-06, "loss": 0.0, "num_tokens": 67038538.0, "reward": 0.6328125, "reward_std": 0.192268505692482, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 792 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.2781362007168457, "grad_norm": 0.11748585850412334, "learning_rate": 1.6285038869305564e-06, "loss": -0.0, "num_tokens": 67114899.0, "reward": 0.78125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 793 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.2810035842293908, "grad_norm": 0.04012524798369861, "learning_rate": 1.6276282918869273e-06, "loss": 0.0, "num_tokens": 67196480.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 794 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0112830604957173e-09, "advantages/std": 0.6185553073883057, "advantages/var": 0.3826106682982413, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 2.2838709677419353, "grad_norm": 0.1256492011230556, "learning_rate": 1.6267519022015393e-06, "loss": 0.0, "num_tokens": 67282663.0, "reward": 0.7578125, "reward_std": 0.16834919154644012, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 795 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.2596798670633243e-09, "advantages/std": 0.7393327355384827, "advantages/var": 0.546612893838816, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.2867383512544803, "grad_norm": 0.18596069133345994, "learning_rate": 1.625874718983991e-06, "loss": 0.0, "num_tokens": 67373465.0, "reward": 0.8125, "reward_std": 0.2698654532432556, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 796 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814432667740602e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.289605734767025, "grad_norm": 0.13332228534824503, "learning_rate": 1.6249967433448867e-06, "loss": -0.0, "num_tokens": 67453638.0, "reward": 0.6640625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 797 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983400669593257e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.29247311827957, "grad_norm": 0.10214177942000607, "learning_rate": 1.6241179763958331e-06, "loss": 0.0, "num_tokens": 67522550.0, "reward": 0.8671875, "reward_std": 0.11336850374937057, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 798 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.252515803253119e-09, "advantages/std": 0.57267826795578, "advantages/var": 0.3279603985888322, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.295340501792115, "grad_norm": 0.10592730340612073, "learning_rate": 1.62323841924944e-06, "loss": 0.0, "num_tokens": 67618155.0, "reward": 0.75, "reward_std": 0.1552036553621292, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 799 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.987466533846137e-09, "advantages/std": 0.46761488914489746, "advantages/var": 0.21866368454999474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.2982078853046595, "grad_norm": 0.08604654935892782, "learning_rate": 1.6223580730193166e-06, "loss": 0.0, "num_tokens": 67703705.0, "reward": 0.734375, "reward_std": 0.12703317403793335, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 800 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016361261689808e-09, "advantages/std": 0.5227997303009033, "advantages/var": 0.27331955800269725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.3010752688172045, "grad_norm": 0.11778978963362384, "learning_rate": 1.6214769388200709e-06, "loss": -0.0, "num_tokens": 67777762.0, "reward": 0.828125, "reward_std": 0.14806944131851196, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 801 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.8787427301770344e-09, "advantages/std": 0.5726819038391113, "advantages/var": 0.32796456298478915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.303942652329749, "grad_norm": 0.1344213731837575, "learning_rate": 1.62059501776731e-06, "loss": -0.0, "num_tokens": 67857518.0, "reward": 0.7578125, "reward_std": 0.15756267309188843, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 802 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.5933270610653542e-08, "advantages/std": 0.46761149168014526, "advantages/var": 0.21866050715133056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.306810035842294, "grad_norm": 0.16060446403631687, "learning_rate": 1.6197123109776358e-06, "loss": 0.0, "num_tokens": 67939276.0, "reward": 0.796875, "reward_std": 0.12255740165710449, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 803 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5627630490351976e-09, "advantages/std": 0.5228091478347778, "advantages/var": 0.2733294050597266, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.3096774193548386, "grad_norm": 0.08810243865681089, "learning_rate": 1.6188288195686457e-06, "loss": 0.0, "num_tokens": 68025822.0, "reward": 0.796875, "reward_std": 0.15831917524337769, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 804 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5628597236829876e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.3125448028673836, "grad_norm": 0.14997348940396277, "learning_rate": 1.6179445446589307e-06, "loss": 0.0, "num_tokens": 68106382.0, "reward": 0.703125, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 805 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.315412186379928, "grad_norm": 0.020152598229408337, "learning_rate": 1.617059487368073e-06, "loss": 0.0, "num_tokens": 68190285.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 806 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.505094572484785e-09, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.318279569892473, "grad_norm": 0.14840773328111326, "learning_rate": 1.616173648816646e-06, "loss": 0.0, "num_tokens": 68268330.0, "reward": 0.9375, "reward_std": 0.1462521106004715, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 807 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814513910737996e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.3211469534050178, "grad_norm": 0.08673907543339605, "learning_rate": 1.6152870301262125e-06, "loss": -0.0, "num_tokens": 68344121.0, "reward": 0.875, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 808 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9874667242452167e-09, "advantages/std": 0.4676148593425751, "advantages/var": 0.21866365667797627, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.3240143369175628, "grad_norm": 0.10158509615616124, "learning_rate": 1.6143996324193223e-06, "loss": -0.0, "num_tokens": 68437027.0, "reward": 0.640625, "reward_std": 0.12703317403793335, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 809 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.3268817204301078, "grad_norm": 0.09161765227577882, "learning_rate": 1.613511456819512e-06, "loss": 0.0, "num_tokens": 68524081.0, "reward": 0.7578125, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 810 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.3297491039426523, "grad_norm": 0.10019493776074073, "learning_rate": 1.6126225044513029e-06, "loss": 0.0, "num_tokens": 68601585.0, "reward": 0.8203125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 811 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.781509278854418e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.332616487455197, "grad_norm": 0.10708769991937248, "learning_rate": 1.6117327764401995e-06, "loss": -0.0, "num_tokens": 68683203.0, "reward": 0.84375, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 812 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33063647150993347, "advantages/var": 0.10932047629253905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.335483870967742, "grad_norm": 0.05251976644698296, "learning_rate": 1.6108422739126893e-06, "loss": -0.0, "num_tokens": 68752714.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 813 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 2.338351254480287, "grad_norm": 0.06283725007481095, "learning_rate": 1.6099509979962393e-06, "loss": -0.0, "num_tokens": 68846524.0, "reward": 0.5, "reward_std": 0.1065337136387825, "rewards/drgrpo_math_reward/mean": 0.5, "rewards/drgrpo_math_reward/std": 0.5019646286964417, "step": 814 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.3412186379928315, "grad_norm": 0.05613515715843307, "learning_rate": 1.6090589498192969e-06, "loss": 0.0, "num_tokens": 68929137.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 815 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.0570579543741361e-08, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.3440860215053765, "grad_norm": 0.1734403360847045, "learning_rate": 1.6081661305112855e-06, "loss": 0.0, "num_tokens": 69003472.0, "reward": 0.9140625, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 816 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.346953405017921, "grad_norm": 0.08192004340641214, "learning_rate": 1.6072725412026065e-06, "loss": 0.0, "num_tokens": 69075836.0, "reward": 0.78125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 817 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983384167481491e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.349820788530466, "grad_norm": 0.10030683878884093, "learning_rate": 1.6063781830246355e-06, "loss": -0.0, "num_tokens": 69165604.0, "reward": 0.7890625, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 818 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.3526881720430106, "grad_norm": 0.14778584127973038, "learning_rate": 1.6054830571097214e-06, "loss": 0.0, "num_tokens": 69239182.0, "reward": 0.9296875, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 819 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.3555555555555556, "grad_norm": 0.02702641973921866, "learning_rate": 1.6045871645911859e-06, "loss": 0.0, "num_tokens": 69313409.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 820 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958222414150722e-10, "advantages/std": 0.4676148593425751, "advantages/var": 0.21866365667797627, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.3584229390681, "grad_norm": 0.10873077079893656, "learning_rate": 1.6036905066033205e-06, "loss": -0.0, "num_tokens": 69392487.0, "reward": 0.890625, "reward_std": 0.12703317403793335, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 821 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125813688641252e-09, "advantages/std": 0.5227880477905273, "advantages/var": 0.2733073429126307, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.361290322580645, "grad_norm": 0.09043236914308421, "learning_rate": 1.6027930842813857e-06, "loss": 0.0, "num_tokens": 69471670.0, "reward": 0.8046875, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 822 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.065478038084407e-09, "advantages/std": 0.5727017521858215, "advantages/var": 0.32798729695671014, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.3641577060931898, "grad_norm": 0.21648865050194732, "learning_rate": 1.6018948987616105e-06, "loss": 0.0, "num_tokens": 69555442.0, "reward": 0.7890625, "reward_std": 0.18253791332244873, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 823 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.318086675313809e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.3670250896057348, "grad_norm": 0.15665828090321152, "learning_rate": 1.6009959511811903e-06, "loss": 0.0, "num_tokens": 69635152.0, "reward": 0.8125, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 824 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.62618106042536e-09, "advantages/std": 0.5727053284645081, "advantages/var": 0.32799139325164006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.3698924731182798, "grad_norm": 0.17779721014436148, "learning_rate": 1.6000962426782841e-06, "loss": -0.0, "num_tokens": 69723627.0, "reward": 0.78125, "reward_std": 0.18489694595336914, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 825 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.394213515514367e-08, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.3727598566308243, "grad_norm": 0.10454738836965104, "learning_rate": 1.5991957743920157e-06, "loss": -0.0, "num_tokens": 69798447.0, "reward": 0.890625, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 826 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.528111170810656e-10, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.3756272401433693, "grad_norm": 0.14756827033506406, "learning_rate": 1.59829454746247e-06, "loss": 0.0, "num_tokens": 69874364.0, "reward": 0.7734375, "reward_std": 0.17859894037246704, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 827 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.4083979969389336e-09, "advantages/std": 0.6612637639045715, "advantages/var": 0.4372697654532409, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.378494623655914, "grad_norm": 0.17218708550366563, "learning_rate": 1.5973925630306928e-06, "loss": 0.0, "num_tokens": 69953901.0, "reward": 0.8515625, "reward_std": 0.1938612163066864, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 828 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.381362007168459, "grad_norm": 0.0739302193567203, "learning_rate": 1.5964898222386886e-06, "loss": -0.0, "num_tokens": 70030525.0, "reward": 0.9140625, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 829 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599795920412534e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.3842293906810035, "grad_norm": 0.1012210466032213, "learning_rate": 1.5955863262294203e-06, "loss": 0.0, "num_tokens": 70111438.0, "reward": 0.796875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 830 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.3870967741935485, "grad_norm": 0.052153777357613056, "learning_rate": 1.5946820761468058e-06, "loss": 0.0, "num_tokens": 70199376.0, "reward": 0.6640625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 831 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.389964157706093, "grad_norm": 0.04994625248728887, "learning_rate": 1.5937770731357189e-06, "loss": -0.0, "num_tokens": 70284417.0, "reward": 0.828125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 832 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 8.944284405653326e-09, "advantages/std": 0.5726868510246277, "advantages/var": 0.3279702293365041, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.392831541218638, "grad_norm": 0.08601123088625985, "learning_rate": 1.5928713183419857e-06, "loss": -0.0, "num_tokens": 70368337.0, "reward": 0.84375, "reward_std": 0.16545338928699493, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 833 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 6.298515617681381e-09, "advantages/std": 0.7393190860748291, "advantages/var": 0.5465927110345206, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.3956989247311826, "grad_norm": 0.16615908431907986, "learning_rate": 1.5919648129123854e-06, "loss": 0.0, "num_tokens": 70447518.0, "reward": 0.7734375, "reward_std": 0.24830512702465057, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 834 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.3985663082437276, "grad_norm": 0.07375274258284313, "learning_rate": 1.5910575579946462e-06, "loss": 0.0, "num_tokens": 70519602.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 835 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9753097007876364e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.4014336917562726, "grad_norm": 0.10647210029597416, "learning_rate": 1.590149554737446e-06, "loss": 0.0, "num_tokens": 70590978.0, "reward": 0.84375, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 836 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962665216109293e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.404301075268817, "grad_norm": 0.08731420444068373, "learning_rate": 1.5892408042904097e-06, "loss": -0.0, "num_tokens": 70678189.0, "reward": 0.828125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 837 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975166506069069e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.4071684587813618, "grad_norm": 0.07912848658835807, "learning_rate": 1.5883313078041092e-06, "loss": 0.0, "num_tokens": 70761137.0, "reward": 0.6796875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 838 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.4100358422939068, "grad_norm": 0.07558171588476088, "learning_rate": 1.5874210664300598e-06, "loss": -0.0, "num_tokens": 70840232.0, "reward": 0.7265625, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 839 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633261853378446e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.412903225806452, "grad_norm": 0.0658563126895547, "learning_rate": 1.5865100813207204e-06, "loss": -0.0, "num_tokens": 70918041.0, "reward": 0.8203125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 840 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814306742559065e-09, "advantages/std": 0.5227947235107422, "advantages/var": 0.27331432293067337, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.4157706093189963, "grad_norm": 0.09599558968393125, "learning_rate": 1.585598353629492e-06, "loss": -0.0, "num_tokens": 71005770.0, "reward": 0.828125, "reward_std": 0.14441713690757751, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 841 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.991736767199176e-09, "advantages/std": 0.4675931930541992, "advantages/var": 0.21864339419062162, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.4186379928315414, "grad_norm": 0.09228833027736225, "learning_rate": 1.5846858845107146e-06, "loss": 0.0, "num_tokens": 71095368.0, "reward": 0.7109375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 842 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199247907244247e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.421505376344086, "grad_norm": 0.0888693255334843, "learning_rate": 1.5837726751196678e-06, "loss": 0.0, "num_tokens": 71179062.0, "reward": 0.8359375, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 843 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4083785526452889e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.424372759856631, "grad_norm": 0.08783138167701011, "learning_rate": 1.582858726612569e-06, "loss": -0.0, "num_tokens": 71255646.0, "reward": 0.7421875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 844 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.4272401433691755, "grad_norm": 0.04169677356932018, "learning_rate": 1.58194404014657e-06, "loss": 0.0, "num_tokens": 71336082.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 845 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.13154848685047e-09, "advantages/std": 0.5726600289344788, "advantages/var": 0.32793950873923805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.4301075268817205, "grad_norm": 0.17353269166094223, "learning_rate": 1.581028616879758e-06, "loss": 0.0, "num_tokens": 71406830.0, "reward": 0.78125, "reward_std": 0.13258251547813416, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 846 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.432974910394265, "grad_norm": 0.11219877687026668, "learning_rate": 1.5801124579711524e-06, "loss": -0.0, "num_tokens": 71478282.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 847 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.337738429595054e-09, "advantages/std": 0.6612692475318909, "advantages/var": 0.43727701773139316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.43584229390681, "grad_norm": 0.17166820647525638, "learning_rate": 1.5791955645807047e-06, "loss": 0.0, "num_tokens": 71567252.0, "reward": 0.75, "reward_std": 0.20069602131843567, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 848 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.4387096774193546, "grad_norm": 0.07494762996346009, "learning_rate": 1.5782779378692954e-06, "loss": -0.0, "num_tokens": 71657155.0, "reward": 0.8515625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 849 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.7495735248314805e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.4415770609318996, "grad_norm": 0.0867713706613162, "learning_rate": 1.5773595789987347e-06, "loss": -0.0, "num_tokens": 71744493.0, "reward": 0.71875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 850 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5628702850968443e-09, "advantages/std": 0.5227934122085571, "advantages/var": 0.27331295184866633, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 2.4444444444444446, "grad_norm": 0.08048893955625597, "learning_rate": 1.5764404891317582e-06, "loss": 0.0, "num_tokens": 71832101.0, "reward": 0.75, "reward_std": 0.1422954797744751, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 851 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 9.79780726992476e-09, "advantages/std": 0.5227980017662048, "advantages/var": 0.2733177506507367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.447311827956989, "grad_norm": 0.1405487397602274, "learning_rate": 1.5755206694320284e-06, "loss": -0.0, "num_tokens": 71913348.0, "reward": 0.703125, "reward_std": 0.14571532607078552, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 852 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 1.09543064605741e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.450179211469534, "grad_norm": 0.10916833852927496, "learning_rate": 1.5746001210641315e-06, "loss": -0.0, "num_tokens": 71995750.0, "reward": 0.8984375, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 853 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.4530465949820788, "grad_norm": 0.09746033420380783, "learning_rate": 1.5736788451935761e-06, "loss": 0.0, "num_tokens": 72067809.0, "reward": 0.7890625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 854 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.455913978494624, "grad_norm": 0.07552149181874301, "learning_rate": 1.572756842986791e-06, "loss": 0.0, "num_tokens": 72148314.0, "reward": 0.796875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 855 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344336502847305e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.4587813620071683, "grad_norm": 0.12037563104289, "learning_rate": 1.5718341156111266e-06, "loss": 0.0, "num_tokens": 72218695.0, "reward": 0.84375, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 856 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.4616487455197134, "grad_norm": 0.037175842241328985, "learning_rate": 1.57091066423485e-06, "loss": -0.0, "num_tokens": 72304989.0, "reward": 0.859375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 857 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975300560494157e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.464516129032258, "grad_norm": 0.096446285201923, "learning_rate": 1.5699864900271452e-06, "loss": 0.0, "num_tokens": 72391234.0, "reward": 0.7734375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 858 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979230209351863e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.467383512544803, "grad_norm": 0.10407527655496832, "learning_rate": 1.5690615941581116e-06, "loss": 0.0, "num_tokens": 72467982.0, "reward": 0.7734375, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 859 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.4702508960573475, "grad_norm": 0.08420394392631414, "learning_rate": 1.5681359777987616e-06, "loss": 0.0, "num_tokens": 72548772.0, "reward": 0.828125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 860 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.4083459495941717e-09, "advantages/std": 0.6612882018089294, "advantages/var": 0.4373020858516874, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.4731182795698925, "grad_norm": 0.13591374215815513, "learning_rate": 1.5672096421210218e-06, "loss": -0.0, "num_tokens": 72638390.0, "reward": 0.6171875, "reward_std": 0.22567616403102875, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 861 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.4759856630824375, "grad_norm": 0.10533486860728158, "learning_rate": 1.5662825882977267e-06, "loss": 0.0, "num_tokens": 72727295.0, "reward": 0.765625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 862 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.478853046594982, "grad_norm": 0.07881267236540548, "learning_rate": 1.5653548175026223e-06, "loss": 0.0, "num_tokens": 72808598.0, "reward": 0.7578125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 863 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.481720430107527, "grad_norm": 0.07840275481089848, "learning_rate": 1.5644263309103612e-06, "loss": 0.0, "num_tokens": 72880030.0, "reward": 0.765625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 864 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.1499403476539522e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.4845878136200716, "grad_norm": 0.108450197441348, "learning_rate": 1.5634971296965027e-06, "loss": -0.0, "num_tokens": 72959411.0, "reward": 0.8203125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 865 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749445740229558e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.4874551971326166, "grad_norm": 0.07035031215141113, "learning_rate": 1.562567215037511e-06, "loss": -0.0, "num_tokens": 73041173.0, "reward": 0.6484375, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 866 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1175438442211334e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.490322580645161, "grad_norm": 0.12188140921287076, "learning_rate": 1.5616365881107527e-06, "loss": -0.0, "num_tokens": 73132608.0, "reward": 0.7265625, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 867 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.493189964157706, "grad_norm": 0.08703212815397973, "learning_rate": 1.5607052500944975e-06, "loss": 0.0, "num_tokens": 73211769.0, "reward": 0.796875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 868 }, { "advantages/mean": -7.916241884231567e-09, "advantages/snr": 1.3822759121218127e-08, "advantages/std": 0.5726962089538574, "advantages/var": 0.3279809477501203, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.496057347670251, "grad_norm": 0.093374326160837, "learning_rate": 1.559773202167915e-06, "loss": 0.0, "num_tokens": 73298035.0, "reward": 0.8203125, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 869 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.498924731182796, "grad_norm": 0.07205080915872775, "learning_rate": 1.5588404455110729e-06, "loss": -0.0, "num_tokens": 73380492.0, "reward": 0.6484375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 870 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878766595896755e-09, "advantages/std": 0.5726791024208069, "advantages/var": 0.327961354349501, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.5017921146953404, "grad_norm": 0.18283812648687808, "learning_rate": 1.557906981304937e-06, "loss": 0.0, "num_tokens": 73459739.0, "reward": 0.8359375, "reward_std": 0.15308690071105957, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 871 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.5047553830684996e-09, "advantages/std": 0.5727025866508484, "advantages/var": 0.3279882527565725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.5046594982078854, "grad_norm": 0.11614709433264811, "learning_rate": 1.5569728107313682e-06, "loss": -0.0, "num_tokens": 73544814.0, "reward": 0.703125, "reward_std": 0.1804211586713791, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 872 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.5075268817204304, "grad_norm": 0.06259624112532391, "learning_rate": 1.556037934973123e-06, "loss": 0.0, "num_tokens": 73623858.0, "reward": 0.796875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 873 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721323019547286e-09, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.510394265232975, "grad_norm": 0.0982949417266809, "learning_rate": 1.5551023552138499e-06, "loss": 0.0, "num_tokens": 73705544.0, "reward": 0.7890625, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 874 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125782003796406e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.5132616487455195, "grad_norm": 0.11004084520606709, "learning_rate": 1.5541660726380884e-06, "loss": -0.0, "num_tokens": 73793032.0, "reward": 0.78125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 875 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.5161290322580645, "grad_norm": 0.0929457191484818, "learning_rate": 1.5532290884312677e-06, "loss": 0.0, "num_tokens": 73875683.0, "reward": 0.6484375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 876 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2523743065903652e-09, "advantages/std": 0.5727031826972961, "advantages/var": 0.32798893547161256, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.5189964157706095, "grad_norm": 0.13685780520987997, "learning_rate": 1.552291403779707e-06, "loss": -0.0, "num_tokens": 73966745.0, "reward": 0.7109375, "reward_std": 0.1814819872379303, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 877 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.521863799283154, "grad_norm": 0.07743984670080835, "learning_rate": 1.5513530198706103e-06, "loss": 0.0, "num_tokens": 74043948.0, "reward": 0.7109375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 878 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.3360778801984135e-08, "advantages/std": 0.5227928161621094, "advantages/var": 0.2733123286307091, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.524731182795699, "grad_norm": 0.1524430947405621, "learning_rate": 1.5504139378920687e-06, "loss": 0.0, "num_tokens": 74119575.0, "reward": 0.8515625, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 879 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185494065284729, "advantages/var": 0.38260336831672603, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.5275985663082436, "grad_norm": 0.19977156677764607, "learning_rate": 1.5494741590330552e-06, "loss": 0.0, "num_tokens": 74196680.0, "reward": 0.859375, "reward_std": 0.16151440143585205, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 880 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.563018557708836e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.5304659498207887, "grad_norm": 0.10824165725563178, "learning_rate": 1.5485336844834272e-06, "loss": 0.0, "num_tokens": 74280381.0, "reward": 0.921875, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 881 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516905874828942e-09, "advantages/std": 0.618557870388031, "advantages/var": 0.38261383901897617, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.533333333333333, "grad_norm": 0.13516385807945042, "learning_rate": 1.5475925154339209e-06, "loss": 0.0, "num_tokens": 74362501.0, "reward": 0.7109375, "reward_std": 0.17282497882843018, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 882 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.5362007168458782, "grad_norm": 0.0878478830412105, "learning_rate": 1.5466506530761535e-06, "loss": -0.0, "num_tokens": 74451427.0, "reward": 0.8359375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 883 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.539068100358423, "grad_norm": 0.09807090921005532, "learning_rate": 1.5457080986026193e-06, "loss": 0.0, "num_tokens": 74541539.0, "reward": 0.734375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 884 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 2.541935483870968, "grad_norm": 0.09864711259132558, "learning_rate": 1.5447648532066886e-06, "loss": -0.0, "num_tokens": 74630489.0, "reward": 0.625, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 885 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691787729288785e-09, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.5448028673835124, "grad_norm": 0.1495485160941214, "learning_rate": 1.543820918082607e-06, "loss": -0.0, "num_tokens": 74708218.0, "reward": 0.828125, "reward_std": 0.16675157845020294, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 886 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.5476702508960574, "grad_norm": 0.0590678300248879, "learning_rate": 1.5428762944254929e-06, "loss": -0.0, "num_tokens": 74784545.0, "reward": 0.6484375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 887 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.2348929130638486e-09, "advantages/std": 0.522804319858551, "advantages/var": 0.27332435686276213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.5505376344086024, "grad_norm": 0.14348185161722257, "learning_rate": 1.5419309834313366e-06, "loss": -0.0, "num_tokens": 74864179.0, "reward": 0.796875, "reward_std": 0.1514892876148224, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 888 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.3444254652277355e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.553405017921147, "grad_norm": 0.09711082824675567, "learning_rate": 1.5409849862969993e-06, "loss": 0.0, "num_tokens": 74956343.0, "reward": 0.7734375, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 889 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562954778661877e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.5562724014336915, "grad_norm": 0.0991722264535798, "learning_rate": 1.5400383042202094e-06, "loss": -0.0, "num_tokens": 75043580.0, "reward": 0.765625, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 890 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.3443230978138685e-09, "advantages/std": 0.5227916836738586, "advantages/var": 0.2733111445185479, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.5591397849462365, "grad_norm": 0.1178474376865152, "learning_rate": 1.5390909383995645e-06, "loss": 0.0, "num_tokens": 75136495.0, "reward": 0.859375, "reward_std": 0.13994136452674866, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 891 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.5620071684587815, "grad_norm": 0.038278747152186696, "learning_rate": 1.5381428900345264e-06, "loss": 0.0, "num_tokens": 75209995.0, "reward": 0.9765625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 892 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.564874551971326, "grad_norm": 0.08258340506919103, "learning_rate": 1.5371941603254212e-06, "loss": -0.0, "num_tokens": 75275390.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 893 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975067111642947e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.567741935483871, "grad_norm": 0.06255011662610364, "learning_rate": 1.5362447504734386e-06, "loss": -0.0, "num_tokens": 75356967.0, "reward": 0.921875, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 894 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022324708878079e-09, "advantages/std": 0.6185801029205322, "advantages/var": 0.38264134372917624, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.5706093189964156, "grad_norm": 0.11484738745539673, "learning_rate": 1.5352946616806284e-06, "loss": 0.0, "num_tokens": 75454518.0, "reward": 0.8046875, "reward_std": 0.2012200653553009, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 895 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983378074428632e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.5734767025089607, "grad_norm": 0.11358406298400703, "learning_rate": 1.5343438951499004e-06, "loss": -0.0, "num_tokens": 75527502.0, "reward": 0.84375, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 896 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.576344086021505, "grad_norm": 0.04788623486583116, "learning_rate": 1.5333924520850226e-06, "loss": 0.0, "num_tokens": 75616569.0, "reward": 0.6953125, "reward_std": 0.08443662524223328, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 897 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 8.016503840289217e-09, "advantages/std": 0.5227904319763184, "advantages/var": 0.27330983576598555, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.5792114695340502, "grad_norm": 0.10138377339107882, "learning_rate": 1.5324403336906192e-06, "loss": 0.0, "num_tokens": 75688872.0, "reward": 0.890625, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 898 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380984365940094, "advantages/var": 0.05466704299203351, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.5820788530465952, "grad_norm": 0.040232547209283584, "learning_rate": 1.5314875411721703e-06, "loss": 0.0, "num_tokens": 75760606.0, "reward": 0.78125, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 899 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.58494623655914, "grad_norm": 0.06228123707556885, "learning_rate": 1.5305340757360084e-06, "loss": -0.0, "num_tokens": 75835645.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 900 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599761052090956e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.5878136200716844, "grad_norm": 0.06372713400930594, "learning_rate": 1.5295799385893187e-06, "loss": 0.0, "num_tokens": 75908865.0, "reward": 0.7890625, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 901 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.6335249592517076e-09, "advantages/std": 0.6612716317176819, "advantages/var": 0.4372801709145655, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.5906810035842294, "grad_norm": 0.1520930595061047, "learning_rate": 1.5286251309401367e-06, "loss": 0.0, "num_tokens": 75989610.0, "reward": 0.828125, "reward_std": 0.20517179369926453, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 902 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899641578136434e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.5935483870967744, "grad_norm": 0.12045278274152169, "learning_rate": 1.5276696539973463e-06, "loss": -0.0, "num_tokens": 76073826.0, "reward": 0.7265625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 903 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 2.596415770609319, "grad_norm": 0.06414423443784778, "learning_rate": 1.5267135089706799e-06, "loss": 0.0, "num_tokens": 76161573.0, "reward": 0.75, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 904 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998806953079044e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.599283154121864, "grad_norm": 0.0943382436274999, "learning_rate": 1.5257566970707146e-06, "loss": 0.0, "num_tokens": 76243364.0, "reward": 0.7265625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 905 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199522104181912e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.6021505376344085, "grad_norm": 0.09240194951046704, "learning_rate": 1.5247992195088726e-06, "loss": 0.0, "num_tokens": 76322823.0, "reward": 0.8984375, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 906 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.6050179211469535, "grad_norm": 0.09991208834120584, "learning_rate": 1.5238410774974186e-06, "loss": -0.0, "num_tokens": 76394083.0, "reward": 0.765625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 907 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995024336639936e-09, "advantages/std": 0.40496665239334106, "advantages/var": 0.16399798955066913, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.607885304659498, "grad_norm": 0.09315548607081912, "learning_rate": 1.522882272249459e-06, "loss": 0.0, "num_tokens": 76469286.0, "reward": 0.796875, "reward_std": 0.09574718773365021, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 908 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.610752688172043, "grad_norm": 0.06860176298994038, "learning_rate": 1.5219228049789385e-06, "loss": -0.0, "num_tokens": 76542018.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 909 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967079601050182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.6136200716845877, "grad_norm": 0.07965350424581469, "learning_rate": 1.5209626769006424e-06, "loss": 0.0, "num_tokens": 76628055.0, "reward": 0.84375, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 910 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344599133202926e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.6164874551971327, "grad_norm": 0.1287594496900276, "learning_rate": 1.52000188923019e-06, "loss": 0.0, "num_tokens": 76698586.0, "reward": 0.9609375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 911 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2946214724693276e-08, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.6193548387096772, "grad_norm": 0.09505569044083684, "learning_rate": 1.5190404431840379e-06, "loss": 0.0, "num_tokens": 76778726.0, "reward": 0.75, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 912 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.6222222222222222, "grad_norm": 0.0714478703207126, "learning_rate": 1.5180783399794748e-06, "loss": -0.0, "num_tokens": 76863299.0, "reward": 0.8125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 913 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6262492693233955e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.6250896057347672, "grad_norm": 0.11699639846045115, "learning_rate": 1.5171155808346225e-06, "loss": -0.0, "num_tokens": 76953880.0, "reward": 0.65625, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 914 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.627956989247312, "grad_norm": 0.06128489390695928, "learning_rate": 1.5161521669684324e-06, "loss": 0.0, "num_tokens": 77027011.0, "reward": 0.71875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 915 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814751550759118e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.6308243727598564, "grad_norm": 0.10874420590707522, "learning_rate": 1.5151880996006849e-06, "loss": 0.0, "num_tokens": 77118787.0, "reward": 0.8046875, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 916 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.6336917562724014, "grad_norm": 0.06940406161791886, "learning_rate": 1.5142233799519888e-06, "loss": 0.0, "num_tokens": 77198046.0, "reward": 0.8359375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 917 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.6365591397849464, "grad_norm": 0.0862377687131543, "learning_rate": 1.5132580092437776e-06, "loss": 0.0, "num_tokens": 77277263.0, "reward": 0.9140625, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 918 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.13111686481873e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.639426523297491, "grad_norm": 0.15485793207595325, "learning_rate": 1.5122919886983101e-06, "loss": 0.0, "num_tokens": 77360134.0, "reward": 0.8046875, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 919 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252321502378045e-09, "advantages/std": 0.5727124810218811, "advantages/var": 0.3279995859182385, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.642293906810036, "grad_norm": 0.12812788734814362, "learning_rate": 1.5113253195386669e-06, "loss": -0.0, "num_tokens": 77449426.0, "reward": 0.75, "reward_std": 0.1962026059627533, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 920 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504893491854984e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 2.6451612903225805, "grad_norm": 0.16883266529636876, "learning_rate": 1.5103580029887501e-06, "loss": 0.0, "num_tokens": 77528729.0, "reward": 0.8046875, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 921 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383744885263768e-08, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.6480286738351255, "grad_norm": 0.13546924997345466, "learning_rate": 1.5093900402732822e-06, "loss": 0.0, "num_tokens": 77614891.0, "reward": 0.6875, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 922 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907665222004876e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.65089605734767, "grad_norm": 0.15356635061355306, "learning_rate": 1.508421432617803e-06, "loss": 0.0, "num_tokens": 77684846.0, "reward": 0.9609375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 923 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814513910737996e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.653763440860215, "grad_norm": 0.16045655005144885, "learning_rate": 1.5074521812486686e-06, "loss": 0.0, "num_tokens": 77763563.0, "reward": 0.765625, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 924 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.528182986834278e-10, "advantages/std": 0.618557333946228, "advantages/var": 0.38261317537866546, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.65663082437276, "grad_norm": 0.12945671903202308, "learning_rate": 1.5064822873930514e-06, "loss": -0.0, "num_tokens": 77843577.0, "reward": 0.84375, "reward_std": 0.17176413536071777, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 925 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.1255179743801086e-09, "advantages/std": 0.5228097438812256, "advantages/var": 0.2733300282971527, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.6594982078853047, "grad_norm": 0.11566487045829575, "learning_rate": 1.5055117522789359e-06, "loss": 0.0, "num_tokens": 77921983.0, "reward": 0.7734375, "reward_std": 0.1593799889087677, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 926 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.6623655913978492, "grad_norm": 0.0722680302433899, "learning_rate": 1.5045405771351192e-06, "loss": 0.0, "num_tokens": 77991904.0, "reward": 0.9375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 927 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.4937775360182155e-08, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 2.6652329749103942, "grad_norm": 0.1048178773070161, "learning_rate": 1.5035687631912088e-06, "loss": -0.0, "num_tokens": 78077359.0, "reward": 0.8125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 928 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.757340237782477e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.6681003584229392, "grad_norm": 0.1607884752224577, "learning_rate": 1.50259631167762e-06, "loss": 0.0, "num_tokens": 78150796.0, "reward": 0.7421875, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 929 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.126672697608316e-08, "advantages/std": 0.6612905859947205, "advantages/var": 0.4373052391252408, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.670967741935484, "grad_norm": 0.14678061253887872, "learning_rate": 1.5016232238255772e-06, "loss": 0.0, "num_tokens": 78237144.0, "reward": 0.8203125, "reward_std": 0.2301519513130188, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 930 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.907107673182429e-09, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.673835125448029, "grad_norm": 0.11465268297178344, "learning_rate": 1.5006495008671088e-06, "loss": -0.0, "num_tokens": 78323828.0, "reward": 0.6015625, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 931 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.6767025089605734, "grad_norm": 0.0856042022501944, "learning_rate": 1.4996751440350477e-06, "loss": -0.0, "num_tokens": 78409881.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 932 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 2.6795698924731184, "grad_norm": 0.10245524726972559, "learning_rate": 1.4987001545630299e-06, "loss": -0.0, "num_tokens": 78508000.0, "reward": 0.515625, "reward_std": 0.1354655772447586, "rewards/drgrpo_math_reward/mean": 0.515625, "rewards/drgrpo_math_reward/std": 0.5017194747924805, "step": 933 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 8.016397819821542e-09, "advantages/std": 0.5227973461151123, "advantages/var": 0.27331706510500453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.682437275985663, "grad_norm": 0.09704250796406216, "learning_rate": 1.4977245336854917e-06, "loss": 0.0, "num_tokens": 78579769.0, "reward": 0.7734375, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 934 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383871662376608e-08, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.685304659498208, "grad_norm": 0.11486660686377306, "learning_rate": 1.4967482826376697e-06, "loss": 0.0, "num_tokens": 78651612.0, "reward": 0.8671875, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 935 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.688172043010753, "grad_norm": 0.0874091285719257, "learning_rate": 1.495771402655597e-06, "loss": -0.0, "num_tokens": 78735336.0, "reward": 0.75, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 936 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.987588013390756e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.6910394265232975, "grad_norm": 0.10806235308978829, "learning_rate": 1.4947938949761053e-06, "loss": 0.0, "num_tokens": 78800359.0, "reward": 0.859375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 937 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.693906810035842, "grad_norm": 0.09387755709107924, "learning_rate": 1.4938157608368173e-06, "loss": 0.0, "num_tokens": 78874400.0, "reward": 0.78125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 938 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907268126346096e-10, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.696774193548387, "grad_norm": 0.10134195018080254, "learning_rate": 1.492837001476153e-06, "loss": -0.0, "num_tokens": 78965723.0, "reward": 0.8046875, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 939 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878670118891239e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.699641577060932, "grad_norm": 0.2235023923375762, "learning_rate": 1.491857618133321e-06, "loss": 0.0, "num_tokens": 79045149.0, "reward": 0.8984375, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 940 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.749390312251308e-09, "advantages/std": 0.4049657881259918, "advantages/var": 0.1639972895525057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.7025089605734767, "grad_norm": 0.08963897075914033, "learning_rate": 1.4908776120483218e-06, "loss": 0.0, "num_tokens": 79124361.0, "reward": 0.8984375, "reward_std": 0.094686359167099, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 941 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252446745927492e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 2.7053763440860212, "grad_norm": 0.11407899853476111, "learning_rate": 1.4898969844619425e-06, "loss": -0.0, "num_tokens": 79220477.0, "reward": 0.7890625, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 942 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899267866969296e-09, "advantages/std": 0.4049658179283142, "advantages/var": 0.16399731369034853, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.7082437275985662, "grad_norm": 0.09343083442825938, "learning_rate": 1.4889157366157595e-06, "loss": -0.0, "num_tokens": 79298331.0, "reward": 0.7578125, "reward_std": 0.094686359167099, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 943 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958906628562059e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.7111111111111112, "grad_norm": 0.12382972314155807, "learning_rate": 1.487933869752132e-06, "loss": 0.0, "num_tokens": 79378092.0, "reward": 0.8359375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 944 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.713978494623656, "grad_norm": 0.12254767675960122, "learning_rate": 1.4869513851142049e-06, "loss": 0.0, "num_tokens": 79467881.0, "reward": 0.6875, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 945 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.716845878136201, "grad_norm": 0.31136183346146684, "learning_rate": 1.485968283945904e-06, "loss": 0.0, "num_tokens": 79544328.0, "reward": 0.75, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 946 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.7197132616487454, "grad_norm": 0.0, "learning_rate": 1.4849845674919364e-06, "loss": 0.0, "num_tokens": 79622291.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 947 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962814881146116e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.7225806451612904, "grad_norm": 0.07069162132403303, "learning_rate": 1.4840002369977878e-06, "loss": 0.0, "num_tokens": 79698502.0, "reward": 0.9453125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 948 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2524704417381583e-09, "advantages/std": 0.5726862549781799, "advantages/var": 0.3279695466409329, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.725448028673835, "grad_norm": 0.13405538041277784, "learning_rate": 1.4830152937097218e-06, "loss": -0.0, "num_tokens": 79788916.0, "reward": 0.6953125, "reward_std": 0.16439256072044373, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 949 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.72831541218638, "grad_norm": 0.10862978629884379, "learning_rate": 1.4820297388747771e-06, "loss": 0.0, "num_tokens": 79871575.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 950 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5628747534061713e-09, "advantages/std": 0.5227927565574646, "advantages/var": 0.27331226630895245, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.731182795698925, "grad_norm": 0.09485759290377778, "learning_rate": 1.4810435737407677e-06, "loss": -0.0, "num_tokens": 79946601.0, "reward": 0.7578125, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 951 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149940263022739e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.7340501792114695, "grad_norm": 0.100642066311835, "learning_rate": 1.480056799556279e-06, "loss": -0.0, "num_tokens": 80017444.0, "reward": 0.8828125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 952 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.736917562724014, "grad_norm": 0.07338200894842473, "learning_rate": 1.4790694175706695e-06, "loss": -0.0, "num_tokens": 80093621.0, "reward": 0.7421875, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 953 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 2.739784946236559, "grad_norm": 0.032849686700925966, "learning_rate": 1.4780814290340649e-06, "loss": -0.0, "num_tokens": 80164645.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 954 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3444321679928155e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.742652329749104, "grad_norm": 0.1503840253544236, "learning_rate": 1.4770928351973603e-06, "loss": -0.0, "num_tokens": 80246580.0, "reward": 0.796875, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 955 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 8.962349336466492e-09, "advantages/std": 0.4676175117492676, "advantages/var": 0.2186661372945764, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.7455197132616487, "grad_norm": 0.07668240362001452, "learning_rate": 1.476103637312217e-06, "loss": -0.0, "num_tokens": 80323828.0, "reward": 0.8515625, "reward_std": 0.130448117852211, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 956 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3444321679928155e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 2.7483870967741937, "grad_norm": 0.10629153913011348, "learning_rate": 1.475113836631061e-06, "loss": 0.0, "num_tokens": 80410044.0, "reward": 0.6875, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 957 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379887186086637e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.7512544802867382, "grad_norm": 0.06630528689284595, "learning_rate": 1.474123434407081e-06, "loss": 0.0, "num_tokens": 80496741.0, "reward": 0.8203125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 958 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.7541218637992833, "grad_norm": 0.10162467858428631, "learning_rate": 1.4731324318942283e-06, "loss": 0.0, "num_tokens": 80571116.0, "reward": 0.796875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 959 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0689055673126508e-08, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.756989247311828, "grad_norm": 0.09520805259728901, "learning_rate": 1.4721408303472131e-06, "loss": 0.0, "num_tokens": 80664243.0, "reward": 0.828125, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 960 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958849501312727e-10, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.759856630824373, "grad_norm": 0.1069432224121283, "learning_rate": 1.471148631021505e-06, "loss": 0.0, "num_tokens": 80742648.0, "reward": 0.875, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 961 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494468808174133, "advantages/var": 0.16398020040561878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.762724014336918, "grad_norm": 0.07304136325985482, "learning_rate": 1.4701558351733302e-06, "loss": -0.0, "num_tokens": 80819619.0, "reward": 0.84375, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 962 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.7655913978494624, "grad_norm": 0.076909777483124, "learning_rate": 1.4691624440596696e-06, "loss": 0.0, "num_tokens": 80912320.0, "reward": 0.734375, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 963 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.768458781362007, "grad_norm": 0.08451304598371802, "learning_rate": 1.468168458938258e-06, "loss": 0.0, "num_tokens": 80992896.0, "reward": 0.75, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 964 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.527936351672609e-10, "advantages/std": 0.6185775995254517, "advantages/var": 0.38263824663467005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.771326164874552, "grad_norm": 0.19101846437724104, "learning_rate": 1.4671738810675836e-06, "loss": 0.0, "num_tokens": 81078065.0, "reward": 0.7109375, "reward_std": 0.19674429297447205, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 965 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.774193548387097, "grad_norm": 0.07835465741994906, "learning_rate": 1.4661787117068825e-06, "loss": 0.0, "num_tokens": 81151463.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 966 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.7770609318996415, "grad_norm": 0.05479424124139151, "learning_rate": 1.4651829521161424e-06, "loss": 0.0, "num_tokens": 81234515.0, "reward": 0.8125, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 967 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.7799283154121865, "grad_norm": 0.16588886508677797, "learning_rate": 1.4641866035560959e-06, "loss": 0.0, "num_tokens": 81305873.0, "reward": 0.8203125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 968 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967257841033182e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.782795698924731, "grad_norm": 0.1158578530989859, "learning_rate": 1.4631896672882234e-06, "loss": 0.0, "num_tokens": 81385432.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 969 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958501673983143e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.785663082437276, "grad_norm": 0.10413897332282132, "learning_rate": 1.4621921445747477e-06, "loss": 0.0, "num_tokens": 81462375.0, "reward": 0.8046875, "reward_std": 0.11336850374937057, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 970 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.404969722032547, "advantages/var": 0.16400047576311838, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.7885304659498207, "grad_norm": 0.08097482759759493, "learning_rate": 1.461194036678635e-06, "loss": -0.0, "num_tokens": 81533072.0, "reward": 0.8359375, "reward_std": 0.09916213154792786, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 971 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691639042734874e-09, "advantages/std": 0.5727047920227051, "advantages/var": 0.3279907788057699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.7913978494623657, "grad_norm": 0.14788883072954456, "learning_rate": 1.4601953448635927e-06, "loss": 0.0, "num_tokens": 81615351.0, "reward": 0.8046875, "reward_std": 0.18383610248565674, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 972 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907256955369e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.7942652329749103, "grad_norm": 0.22342003375054953, "learning_rate": 1.459196070394066e-06, "loss": -0.0, "num_tokens": 81693213.0, "reward": 0.828125, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 973 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450174584180815e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.7971326164874553, "grad_norm": 0.12356303934435144, "learning_rate": 1.4581962145352402e-06, "loss": 0.0, "num_tokens": 81764294.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 974 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.9792225930357895e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 2.8, "grad_norm": 0.12701479463293694, "learning_rate": 1.457195778553034e-06, "loss": 0.0, "num_tokens": 81849035.0, "reward": 0.71875, "reward_std": 0.11678344756364822, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 975 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.2470159737516598e-08, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.802867383512545, "grad_norm": 0.13362216416825526, "learning_rate": 1.4561947637141029e-06, "loss": 0.0, "num_tokens": 81923931.0, "reward": 0.859375, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 976 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.50560656552406e-09, "advantages/std": 0.6185696721076965, "advantages/var": 0.3826284392514232, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.80573476702509, "grad_norm": 0.11692498741580577, "learning_rate": 1.4551931712858331e-06, "loss": 0.0, "num_tokens": 82009690.0, "reward": 0.7734375, "reward_std": 0.18649455904960632, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 977 }, { "advantages/mean": 6.984919309616089e-09, "advantages/snr": 1.1291831627915751e-08, "advantages/std": 0.6185815930366516, "advantages/var": 0.3826431872437617, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.8086021505376344, "grad_norm": 0.1406608507322815, "learning_rate": 1.454191002536345e-06, "loss": -0.0, "num_tokens": 82088904.0, "reward": 0.7578125, "reward_std": 0.20357416570186615, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 978 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.811469534050179, "grad_norm": 0.09124679518816567, "learning_rate": 1.4531882587344857e-06, "loss": 0.0, "num_tokens": 82178261.0, "reward": 0.703125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 979 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.814336917562724, "grad_norm": 0.06809812851547395, "learning_rate": 1.4521849411498318e-06, "loss": 0.0, "num_tokens": 82251057.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 980 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.817204301075269, "grad_norm": 0.1073867350067464, "learning_rate": 1.4511810510526867e-06, "loss": 0.0, "num_tokens": 82329285.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 981 }, { "advantages/mean": -9.313225746154785e-09, "advantages/snr": 1.3278335481571027e-08, "advantages/std": 0.7013850212097168, "advantages/var": 0.4919409479773549, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.8200716845878135, "grad_norm": 0.16588912014450993, "learning_rate": 1.4501765897140778e-06, "loss": -0.0, "num_tokens": 82417959.0, "reward": 0.84375, "reward_std": 0.22962789237499237, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 982 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.8229390681003586, "grad_norm": 0.052627823166171435, "learning_rate": 1.449171558405756e-06, "loss": 0.0, "num_tokens": 82503360.0, "reward": 0.65625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 983 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 9.798125641320161e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 2.825806451612903, "grad_norm": 0.12406921859703168, "learning_rate": 1.4481659584001946e-06, "loss": 0.0, "num_tokens": 82592272.0, "reward": 0.875, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 984 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.258355009955157e-09, "advantages/std": 0.6185846924781799, "advantages/var": 0.38264702176832444, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.828673835125448, "grad_norm": 0.14853677123319015, "learning_rate": 1.4471597909705855e-06, "loss": 0.0, "num_tokens": 82690245.0, "reward": 0.765625, "reward_std": 0.20911076664924622, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 985 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970882751882131e-09, "advantages/std": 0.4676063358783722, "advantages/var": 0.21865568535359703, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.8315412186379927, "grad_norm": 0.07254601773018031, "learning_rate": 1.4461530573908406e-06, "loss": 0.0, "num_tokens": 82782718.0, "reward": 0.71875, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 986 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.8344086021505377, "grad_norm": 0.09383271684892873, "learning_rate": 1.4451457589355872e-06, "loss": -0.0, "num_tokens": 82864889.0, "reward": 0.859375, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 987 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.8372759856630827, "grad_norm": 0.14252668262964652, "learning_rate": 1.4441378968801686e-06, "loss": 0.0, "num_tokens": 82943320.0, "reward": 0.8359375, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 988 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.8401433691756273, "grad_norm": 0.08642639406893778, "learning_rate": 1.4431294725006413e-06, "loss": 0.0, "num_tokens": 83024227.0, "reward": 0.84375, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 989 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.974942206861495e-09, "advantages/std": 0.46761417388916016, "advantages/var": 0.2186630156220417, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.843010752688172, "grad_norm": 0.08548485932771985, "learning_rate": 1.4421204870737745e-06, "loss": -0.0, "num_tokens": 83114552.0, "reward": 0.6953125, "reward_std": 0.12597234547138214, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 990 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814432667740602e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.845878136200717, "grad_norm": 0.10774296462925989, "learning_rate": 1.4411109418770465e-06, "loss": 0.0, "num_tokens": 83196364.0, "reward": 0.7578125, "reward_std": 0.13888053596019745, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 991 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.848745519713262, "grad_norm": 0.05479591059822382, "learning_rate": 1.4401008381886457e-06, "loss": 0.0, "num_tokens": 83282399.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 992 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.258368719288218e-09, "advantages/std": 0.6185809373855591, "advantages/var": 0.38264237609679697, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.8516129032258064, "grad_norm": 0.09920848222282774, "learning_rate": 1.4390901772874666e-06, "loss": -0.0, "num_tokens": 83371633.0, "reward": 0.859375, "reward_std": 0.19910329580307007, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 993 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.8544802867383514, "grad_norm": 0.08555965379344638, "learning_rate": 1.4380789604531094e-06, "loss": 0.0, "num_tokens": 83459958.0, "reward": 0.7265625, "reward_std": 0.12863080203533173, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 994 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.2469952164166843e-08, "advantages/std": 0.5227973461151123, "advantages/var": 0.27331706510500453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.857347670250896, "grad_norm": 0.08960529128256936, "learning_rate": 1.4370671889658782e-06, "loss": -0.0, "num_tokens": 83543626.0, "reward": 0.8046875, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 995 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125942055767658e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.860215053763441, "grad_norm": 0.09127515038097096, "learning_rate": 1.4360548641067798e-06, "loss": 0.0, "num_tokens": 83618205.0, "reward": 0.9453125, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 996 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983628920516591e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.8630824372759855, "grad_norm": 0.11549919848538252, "learning_rate": 1.4350419871575208e-06, "loss": -0.0, "num_tokens": 83691289.0, "reward": 0.828125, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 997 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.8659498207885306, "grad_norm": 0.08600165163589032, "learning_rate": 1.4340285594005078e-06, "loss": -0.0, "num_tokens": 83774046.0, "reward": 0.78125, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 998 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.868817204301075, "grad_norm": 0.0902222187264063, "learning_rate": 1.4330145821188434e-06, "loss": 0.0, "num_tokens": 83854337.0, "reward": 0.953125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 999 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 2.87168458781362, "grad_norm": 0.13998648226573715, "learning_rate": 1.432000056596328e-06, "loss": -0.0, "num_tokens": 83933985.0, "reward": 0.8515625, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1000 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.8745519713261647, "grad_norm": 0.05841281416678034, "learning_rate": 1.4309849841174535e-06, "loss": 0.0, "num_tokens": 84005555.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1001 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.130963692329808e-10, "advantages/std": 0.5727012157440186, "advantages/var": 0.3279866825146769, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.8774193548387097, "grad_norm": 0.134757766435047, "learning_rate": 1.429969365967407e-06, "loss": 0.0, "num_tokens": 84088581.0, "reward": 0.765625, "reward_std": 0.18147708475589752, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1002 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983495876754114e-09, "advantages/std": 0.4675905704498291, "advantages/var": 0.2186409415735966, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 2.8802867383512547, "grad_norm": 0.10776129736819205, "learning_rate": 1.4289532034320647e-06, "loss": 0.0, "num_tokens": 84165395.0, "reward": 0.796875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1003 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.906988860324416e-09, "advantages/std": 0.5228043794631958, "advantages/var": 0.2733244191858972, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.8831541218637993, "grad_norm": 0.12360334360957083, "learning_rate": 1.427936497797992e-06, "loss": -0.0, "num_tokens": 84253316.0, "reward": 0.671875, "reward_std": 0.1514892876148224, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1004 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.3361080419982039e-08, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.886021505376344, "grad_norm": 0.11279355138455799, "learning_rate": 1.4269192503524432e-06, "loss": 0.0, "num_tokens": 84345585.0, "reward": 0.875, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1005 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.888888888888889, "grad_norm": 0.062167831208651166, "learning_rate": 1.4259014623833576e-06, "loss": 0.0, "num_tokens": 84425396.0, "reward": 0.8828125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1006 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.891756272401434, "grad_norm": 0.07904456306252947, "learning_rate": 1.424883135179359e-06, "loss": -0.0, "num_tokens": 84503969.0, "reward": 0.859375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1007 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516924590743576e-09, "advantages/std": 0.6185553073883057, "advantages/var": 0.3826106682982413, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.8946236559139784, "grad_norm": 0.15491260783383373, "learning_rate": 1.4238642700297544e-06, "loss": -0.0, "num_tokens": 84591781.0, "reward": 0.8203125, "reward_std": 0.16834920644760132, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1008 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.8974910394265234, "grad_norm": 0.0905853947441121, "learning_rate": 1.422844868224531e-06, "loss": -0.0, "num_tokens": 84670631.0, "reward": 0.9375, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1009 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958849501312727e-10, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.900358422939068, "grad_norm": 0.10623597528855593, "learning_rate": 1.4218249310543562e-06, "loss": -0.0, "num_tokens": 84751482.0, "reward": 0.875, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1010 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.903225806451613, "grad_norm": 0.08308100916550797, "learning_rate": 1.4208044598105754e-06, "loss": -0.0, "num_tokens": 84838729.0, "reward": 0.796875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1011 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.9060931899641576, "grad_norm": 0.06735697884255029, "learning_rate": 1.419783455785209e-06, "loss": -0.0, "num_tokens": 84917736.0, "reward": 0.7578125, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1012 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.9089605734767026, "grad_norm": 0.10682262758743971, "learning_rate": 1.4187619202709536e-06, "loss": 0.0, "num_tokens": 84999114.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1013 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.6333472398789395e-09, "advantages/std": 0.6612924933433533, "advantages/var": 0.43730776175226893, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.9118279569892476, "grad_norm": 0.10684262770251247, "learning_rate": 1.4177398545611775e-06, "loss": 0.0, "num_tokens": 85096639.0, "reward": 0.625, "reward_std": 0.23356688022613525, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 1014 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016755193120049e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.914695340501792, "grad_norm": 0.09541058678290444, "learning_rate": 1.4167172599499207e-06, "loss": 0.0, "num_tokens": 85176369.0, "reward": 0.8984375, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1015 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344456541825744e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.9175627240143367, "grad_norm": 0.09885274980236455, "learning_rate": 1.4156941377318927e-06, "loss": 0.0, "num_tokens": 85255259.0, "reward": 0.7890625, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1016 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504875212414157e-09, "advantages/std": 0.5726920366287231, "advantages/var": 0.32797616881795477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.9204301075268817, "grad_norm": 0.13566379630410266, "learning_rate": 1.4146704892024711e-06, "loss": 0.0, "num_tokens": 85335500.0, "reward": 0.7734375, "reward_std": 0.1701665222644806, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1017 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.97930573908082e-09, "advantages/std": 0.4675965905189514, "advantages/var": 0.21864657146494793, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 2.9232974910394267, "grad_norm": 0.08449164175546724, "learning_rate": 1.4136463156577004e-06, "loss": -0.0, "num_tokens": 85420546.0, "reward": 0.8046875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1018 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.9261648745519713, "grad_norm": 0.110435170845307, "learning_rate": 1.4126216183942886e-06, "loss": -0.0, "num_tokens": 85501354.0, "reward": 0.75, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1019 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749358156051495e-09, "advantages/std": 0.4049680531024933, "advantages/var": 0.16399912403362382, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.9290322580645163, "grad_norm": 0.09421270821046172, "learning_rate": 1.4115963987096078e-06, "loss": -0.0, "num_tokens": 85593918.0, "reward": 0.7734375, "reward_std": 0.09704046696424484, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1020 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958294131658464e-10, "advantages/std": 0.46761149168014526, "advantages/var": 0.21866050715133056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.931899641577061, "grad_norm": 0.0775638917687058, "learning_rate": 1.4105706579016914e-06, "loss": -0.0, "num_tokens": 85687755.0, "reward": 0.71875, "reward_std": 0.12255740165710449, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1021 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2946288161655503e-08, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.934767025089606, "grad_norm": 0.11807849656225168, "learning_rate": 1.409544397269232e-06, "loss": 0.0, "num_tokens": 85761830.0, "reward": 0.9453125, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1022 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344282883117038e-09, "advantages/std": 0.5227956175804138, "advantages/var": 0.2733152577612863, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.9376344086021504, "grad_norm": 0.11236698481614323, "learning_rate": 1.408517618111581e-06, "loss": -0.0, "num_tokens": 85849256.0, "reward": 0.7578125, "reward_std": 0.14230036735534668, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1023 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.264868475059852e-08, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.9405017921146954, "grad_norm": 0.1800267764243477, "learning_rate": 1.407490321728746e-06, "loss": 0.0, "num_tokens": 85934211.0, "reward": 0.71875, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1024 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 2.94336917562724, "grad_norm": 0.07265991277831217, "learning_rate": 1.4064625094213898e-06, "loss": 0.0, "num_tokens": 86003296.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1025 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.946236559139785, "grad_norm": 0.13200183927295472, "learning_rate": 1.405434182490828e-06, "loss": -0.0, "num_tokens": 86073842.0, "reward": 0.8203125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1026 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 9.958667967959763e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 2.9491039426523296, "grad_norm": 0.11495201119172228, "learning_rate": 1.4044053422390278e-06, "loss": 0.0, "num_tokens": 86154117.0, "reward": 0.734375, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1027 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516696093096275e-09, "advantages/std": 0.6185865998268127, "advantages/var": 0.38264938148529737, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 2.9519713261648746, "grad_norm": 0.10893709159700979, "learning_rate": 1.4033759899686061e-06, "loss": 0.0, "num_tokens": 86240235.0, "reward": 0.7734375, "reward_std": 0.20911568403244019, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1028 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676129221916199, "advantages/var": 0.21866184500058594, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.9548387096774196, "grad_norm": 0.09579100145553239, "learning_rate": 1.4023461269828296e-06, "loss": 0.0, "num_tokens": 86325908.0, "reward": 0.78125, "reward_std": 0.1246790662407875, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1029 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983629174425397e-09, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.957706093189964, "grad_norm": 0.08513116523147185, "learning_rate": 1.401315754585609e-06, "loss": 0.0, "num_tokens": 86414814.0, "reward": 0.734375, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1030 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262171105692396e-09, "advantages/std": 0.5726926326751709, "advantages/var": 0.3279768515204182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 2.9605734767025087, "grad_norm": 0.1262348988136174, "learning_rate": 1.400284874081502e-06, "loss": -0.0, "num_tokens": 86503399.0, "reward": 0.765625, "reward_std": 0.1712273508310318, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1031 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.9634408602150537, "grad_norm": 0.08103645772255394, "learning_rate": 1.3992534867757089e-06, "loss": 0.0, "num_tokens": 86564084.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1032 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.9663082437275987, "grad_norm": 0.0849378340052373, "learning_rate": 1.3982215939740725e-06, "loss": 0.0, "num_tokens": 86648887.0, "reward": 0.703125, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 1033 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.463608042839594e-08, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.9691756272401433, "grad_norm": 0.281596752146975, "learning_rate": 1.3971891969830733e-06, "loss": 0.0, "num_tokens": 86729674.0, "reward": 0.9140625, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1034 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.9720430107526883, "grad_norm": 0.11168309540429934, "learning_rate": 1.3961562971098335e-06, "loss": -0.0, "num_tokens": 86811377.0, "reward": 0.75, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1035 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185722351074219, "advantages/var": 0.3826316100457916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 2.974910394265233, "grad_norm": 0.14349750402415082, "learning_rate": 1.395122895662109e-06, "loss": 0.0, "num_tokens": 86898169.0, "reward": 0.7265625, "reward_std": 0.19097033143043518, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1036 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 2.977777777777778, "grad_norm": 0.0785080644075257, "learning_rate": 1.3940889939482923e-06, "loss": 0.0, "num_tokens": 86978019.0, "reward": 0.734375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1037 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.9806451612903224, "grad_norm": 0.1306098130087868, "learning_rate": 1.3930545932774092e-06, "loss": 0.0, "num_tokens": 87049853.0, "reward": 0.796875, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1038 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.0349153895649778e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.9835125448028674, "grad_norm": 0.09788399616171604, "learning_rate": 1.3920196949591166e-06, "loss": -0.0, "num_tokens": 87135503.0, "reward": 0.7890625, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1039 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131077936456911e-10, "advantages/std": 0.5726931691169739, "advantages/var": 0.32797746595324284, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 2.9863799283154124, "grad_norm": 0.1527075830682524, "learning_rate": 1.390984300303702e-06, "loss": 0.0, "num_tokens": 87222915.0, "reward": 0.7734375, "reward_std": 0.172288179397583, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1040 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.989247311827957, "grad_norm": 0.046094532672897426, "learning_rate": 1.3899484106220814e-06, "loss": 0.0, "num_tokens": 87297903.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1041 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 2.9921146953405016, "grad_norm": 0.0983977964273471, "learning_rate": 1.388912027225797e-06, "loss": -0.0, "num_tokens": 87371974.0, "reward": 0.8125, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1042 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998806953079044e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 2.9949820788530466, "grad_norm": 0.10504310759419465, "learning_rate": 1.3878751514270169e-06, "loss": -0.0, "num_tokens": 87445745.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1043 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 2.9978494623655916, "grad_norm": 0.08956076137213259, "learning_rate": 1.3868377845385317e-06, "loss": -0.0, "num_tokens": 87531820.0, "reward": 0.84375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1044 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46760615706443787, "advantages/var": 0.21865551812457173, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.002867383512545, "grad_norm": 0.11278050956106807, "learning_rate": 1.3857999278737545e-06, "loss": -0.0, "num_tokens": 87611902.0, "reward": 0.84375, "reward_std": 0.1157275140285492, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1045 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.0057347670250896, "grad_norm": 0.07363958021084817, "learning_rate": 1.384761582746718e-06, "loss": 0.0, "num_tokens": 87678237.0, "reward": 0.953125, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1046 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.5209261236260495e-09, "advantages/std": 0.6612766981124878, "advantages/var": 0.4372868714665543, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.0086021505376346, "grad_norm": 0.15511405904834488, "learning_rate": 1.3837227504720736e-06, "loss": -0.0, "num_tokens": 87766887.0, "reward": 0.578125, "reward_std": 0.2109457552433014, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.4957992732524872, "step": 1047 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726862549781799, "advantages/var": 0.3279695466409329, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.011469534050179, "grad_norm": 0.14518322064472808, "learning_rate": 1.3826834323650898e-06, "loss": 0.0, "num_tokens": 87846837.0, "reward": 0.7890625, "reward_std": 0.16439256072044373, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1048 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227956175804138, "advantages/var": 0.2733152577612863, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.014336917562724, "grad_norm": 0.09863025981033159, "learning_rate": 1.3816436297416494e-06, "loss": -0.0, "num_tokens": 87928695.0, "reward": 0.8203125, "reward_std": 0.14230038225650787, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1049 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.0172043010752687, "grad_norm": 0.10991296190658557, "learning_rate": 1.3806033439182497e-06, "loss": -0.0, "num_tokens": 88007220.0, "reward": 0.8046875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1050 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.0200716845878137, "grad_norm": 0.08240359629638405, "learning_rate": 1.3795625762119985e-06, "loss": -0.0, "num_tokens": 88091728.0, "reward": 0.7265625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1051 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.0229390681003583, "grad_norm": 0.08775335038983857, "learning_rate": 1.3785213279406146e-06, "loss": 0.0, "num_tokens": 88167242.0, "reward": 0.8984375, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1052 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.0258064516129033, "grad_norm": 0.12907689906371417, "learning_rate": 1.3774796004224256e-06, "loss": -0.0, "num_tokens": 88234112.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1053 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.028673835125448, "grad_norm": 0.09909875372545414, "learning_rate": 1.3764373949763645e-06, "loss": -0.0, "num_tokens": 88304660.0, "reward": 0.9375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1054 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262188031035393e-09, "advantages/std": 0.5726920366287231, "advantages/var": 0.32797616881795477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.031541218637993, "grad_norm": 0.1472668433116852, "learning_rate": 1.375394712921971e-06, "loss": -0.0, "num_tokens": 88400732.0, "reward": 0.7578125, "reward_std": 0.1701665222644806, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1055 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958574030221743e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.0344086021505374, "grad_norm": 0.11660139742796921, "learning_rate": 1.374351555579387e-06, "loss": 0.0, "num_tokens": 88469780.0, "reward": 0.9453125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1056 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983533706996105e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.0372759856630824, "grad_norm": 0.10045295452108793, "learning_rate": 1.3733079242693571e-06, "loss": 0.0, "num_tokens": 88540617.0, "reward": 0.8359375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1057 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.0401433691756274, "grad_norm": 0.10774423597192603, "learning_rate": 1.372263820313225e-06, "loss": 0.0, "num_tokens": 88619796.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1058 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 1.379866977655094e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.043010752688172, "grad_norm": 0.14459464886811796, "learning_rate": 1.3712192450329336e-06, "loss": -0.0, "num_tokens": 88704086.0, "reward": 0.8671875, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1059 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.453528449034464e-09, "advantages/std": 0.5228003859519958, "advantages/var": 0.2733202435515558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.045878136200717, "grad_norm": 0.20995250081848818, "learning_rate": 1.3701741997510221e-06, "loss": 0.0, "num_tokens": 88782665.0, "reward": 0.8046875, "reward_std": 0.14913025498390198, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1060 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.96706741399221e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.0487455197132616, "grad_norm": 0.12442638943183558, "learning_rate": 1.3691286857906251e-06, "loss": 0.0, "num_tokens": 88854072.0, "reward": 0.7734375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1061 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.0112256078484365e-09, "advantages/std": 0.6185671091079712, "advantages/var": 0.38262526847019274, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.0516129032258066, "grad_norm": 0.14621710227022958, "learning_rate": 1.3680827044754707e-06, "loss": -0.0, "num_tokens": 88945302.0, "reward": 0.9140625, "reward_std": 0.18201878666877747, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1062 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.453534034272115e-09, "advantages/std": 0.5227997303009033, "advantages/var": 0.27331955800269725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.054480286738351, "grad_norm": 0.10101354256707841, "learning_rate": 1.367036257129878e-06, "loss": 0.0, "num_tokens": 89035504.0, "reward": 0.828125, "reward_std": 0.14806944131851196, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1063 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.057347670250896, "grad_norm": 0.09216983462994441, "learning_rate": 1.3659893450787573e-06, "loss": 0.0, "num_tokens": 89116768.0, "reward": 0.8671875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1064 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876548503938182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.0602150537634407, "grad_norm": 0.07296259186673848, "learning_rate": 1.3649419696476055e-06, "loss": -0.0, "num_tokens": 89193429.0, "reward": 0.765625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1065 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.0630824372759857, "grad_norm": 0.1040375514130535, "learning_rate": 1.3638941321625084e-06, "loss": 0.0, "num_tokens": 89261258.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1066 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.0659498207885303, "grad_norm": 0.09775719772800988, "learning_rate": 1.3628458339501347e-06, "loss": -0.0, "num_tokens": 89356079.0, "reward": 0.8125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1067 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 3.0688172043010753, "grad_norm": 0.09456317741154854, "learning_rate": 1.3617970763377383e-06, "loss": 0.0, "num_tokens": 89453113.0, "reward": 0.8125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1068 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.453592933994712e-09, "advantages/std": 0.5227928161621094, "advantages/var": 0.2733123286307091, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.07168458781362, "grad_norm": 0.10740696477457821, "learning_rate": 1.360747860653153e-06, "loss": -0.0, "num_tokens": 89537226.0, "reward": 0.5390625, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.5390625, "rewards/drgrpo_math_reward/std": 0.5004304051399231, "step": 1069 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.074551971326165, "grad_norm": 0.10686438889895704, "learning_rate": 1.3596981882247942e-06, "loss": -0.0, "num_tokens": 89619330.0, "reward": 0.7265625, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1070 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.07741935483871, "grad_norm": 0.09464056194453326, "learning_rate": 1.3586480603816543e-06, "loss": 0.0, "num_tokens": 89704066.0, "reward": 0.703125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 1071 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.0802867383512544, "grad_norm": 0.18192909878470326, "learning_rate": 1.3575974784533031e-06, "loss": -0.0, "num_tokens": 89778195.0, "reward": 0.8671875, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1072 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9750579720916185e-09, "advantages/std": 0.4676051139831543, "advantages/var": 0.21865454262319872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.0831541218637994, "grad_norm": 0.1056231633941552, "learning_rate": 1.3565464437698848e-06, "loss": -0.0, "num_tokens": 89856346.0, "reward": 0.8515625, "reward_std": 0.11784426867961884, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1073 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299880526045478e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.086021505376344, "grad_norm": 0.12922868766375104, "learning_rate": 1.355494957662117e-06, "loss": -0.0, "num_tokens": 89940852.0, "reward": 0.8203125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1074 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.088888888888889, "grad_norm": 0.15690728990169092, "learning_rate": 1.3544430214612895e-06, "loss": 0.0, "num_tokens": 90018486.0, "reward": 0.71875, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1075 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975244195968941e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.0917562724014336, "grad_norm": 0.14509465934671859, "learning_rate": 1.3533906364992604e-06, "loss": 0.0, "num_tokens": 90090651.0, "reward": 0.703125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 1076 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966859224177393e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.0946236559139786, "grad_norm": 0.11225193068731186, "learning_rate": 1.3523378041084574e-06, "loss": 0.0, "num_tokens": 90171019.0, "reward": 0.6953125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1077 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.097491039426523, "grad_norm": 0.08430816486137278, "learning_rate": 1.3512845256218746e-06, "loss": 0.0, "num_tokens": 90244863.0, "reward": 0.78125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1078 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5056365973668555e-09, "advantages/std": 0.618557333946228, "advantages/var": 0.38261317537866546, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.100358422939068, "grad_norm": 0.13897323270534442, "learning_rate": 1.35023080237307e-06, "loss": 0.0, "num_tokens": 90322473.0, "reward": 0.796875, "reward_std": 0.17176413536071777, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1079 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185835003852844, "advantages/var": 0.3826455469489112, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.1032258064516127, "grad_norm": 0.1296860171128417, "learning_rate": 1.3491766356961658e-06, "loss": -0.0, "num_tokens": 90413848.0, "reward": 0.671875, "reward_std": 0.20357906818389893, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1080 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516924590743576e-09, "advantages/std": 0.6185553073883057, "advantages/var": 0.3826106682982413, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.1060931899641577, "grad_norm": 0.09957232647714506, "learning_rate": 1.3481220269258446e-06, "loss": 0.0, "num_tokens": 90508557.0, "reward": 0.8046875, "reward_std": 0.16834920644760132, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1081 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.975176026781512e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.1089605734767023, "grad_norm": 0.11632736235577498, "learning_rate": 1.3470669773973495e-06, "loss": -0.0, "num_tokens": 90595707.0, "reward": 0.796875, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1082 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.1118279569892473, "grad_norm": 0.08457504067488755, "learning_rate": 1.3460114884464813e-06, "loss": 0.0, "num_tokens": 90658904.0, "reward": 0.890625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1083 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344385248990318e-09, "advantages/std": 0.5227856040000916, "advantages/var": 0.27330478774974054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.1146953405017923, "grad_norm": 0.13503806139774452, "learning_rate": 1.3449555614095968e-06, "loss": 0.0, "num_tokens": 90732573.0, "reward": 0.875, "reward_std": 0.13098980486392975, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1084 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.117562724014337, "grad_norm": 0.08829586881255871, "learning_rate": 1.3438991976236084e-06, "loss": 0.0, "num_tokens": 90806880.0, "reward": 0.859375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1085 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.120430107526882, "grad_norm": 0.1341238536550148, "learning_rate": 1.342842398425981e-06, "loss": -0.0, "num_tokens": 90881116.0, "reward": 0.8125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1086 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954289704678737e-08, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.1232974910394264, "grad_norm": 0.09064624961768537, "learning_rate": 1.3417851651547306e-06, "loss": 0.0, "num_tokens": 90971531.0, "reward": 0.734375, "reward_std": 0.11678344756364822, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1087 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907386946654693e-10, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.1261648745519715, "grad_norm": 0.09238775805363784, "learning_rate": 1.3407274991484222e-06, "loss": -0.0, "num_tokens": 91064414.0, "reward": 0.8125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1088 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967079601050182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.129032258064516, "grad_norm": 0.09329067748840777, "learning_rate": 1.3396694017461707e-06, "loss": 0.0, "num_tokens": 91148087.0, "reward": 0.703125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 1089 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599521727490371e-09, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.131899641577061, "grad_norm": 0.10497977844609316, "learning_rate": 1.3386108742876349e-06, "loss": -0.0, "num_tokens": 91211425.0, "reward": 0.859375, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1090 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6262555319655851e-09, "advantages/std": 0.5726791024208069, "advantages/var": 0.327961354349501, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.1347670250896056, "grad_norm": 0.1294893154667107, "learning_rate": 1.3375519181130192e-06, "loss": 0.0, "num_tokens": 91290971.0, "reward": 0.6484375, "reward_std": 0.15308690071105957, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 1091 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.1376344086021506, "grad_norm": 0.07557647834198548, "learning_rate": 1.3364925345630711e-06, "loss": -0.0, "num_tokens": 91375027.0, "reward": 0.875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1092 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.8788021410185465e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.140501792114695, "grad_norm": 0.1400886797653509, "learning_rate": 1.3354327249790785e-06, "loss": 0.0, "num_tokens": 91462378.0, "reward": 0.8515625, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1093 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.527881223669866e-09, "advantages/std": 0.6185821294784546, "advantages/var": 0.38264385091009956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.14336917562724, "grad_norm": 0.12259425546705846, "learning_rate": 1.334372490702869e-06, "loss": 0.0, "num_tokens": 91549766.0, "reward": 0.859375, "reward_std": 0.20463500916957855, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1094 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.146236559139785, "grad_norm": 0.04587102497418084, "learning_rate": 1.3333118330768082e-06, "loss": 0.0, "num_tokens": 91642091.0, "reward": 0.421875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.421875, "rewards/drgrpo_math_reward/std": 0.4957992732524872, "step": 1095 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.3278481048164687e-09, "advantages/std": 0.7013773322105408, "advantages/var": 0.4919301621387753, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.1491039426523297, "grad_norm": 0.16406640602715863, "learning_rate": 1.3322507534437963e-06, "loss": 0.0, "num_tokens": 91731679.0, "reward": 0.8359375, "reward_std": 0.2214949131011963, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1096 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.1519713261648747, "grad_norm": 0.12206538089462148, "learning_rate": 1.3311892531472704e-06, "loss": -0.0, "num_tokens": 91804150.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1097 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983311559814009e-09, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.1548387096774193, "grad_norm": 0.09001265590862512, "learning_rate": 1.3301273335311976e-06, "loss": 0.0, "num_tokens": 91888615.0, "reward": 0.7109375, "reward_std": 0.1236182302236557, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1098 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199093551714649e-09, "advantages/std": 0.4049627482891083, "advantages/var": 0.16399482750186767, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.1577060931899643, "grad_norm": 0.19512990430006444, "learning_rate": 1.3290649959400775e-06, "loss": -0.0, "num_tokens": 91964406.0, "reward": 0.9375, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1099 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966888674758758e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.160573476702509, "grad_norm": 0.08814638579810388, "learning_rate": 1.328002241718938e-06, "loss": 0.0, "num_tokens": 92050716.0, "reward": 0.8046875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1100 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.163440860215054, "grad_norm": 0.09837497675644805, "learning_rate": 1.3269390722133356e-06, "loss": 0.0, "num_tokens": 92125083.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1101 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.1663082437275984, "grad_norm": 0.04586394637505574, "learning_rate": 1.325875488769351e-06, "loss": -0.0, "num_tokens": 92198984.0, "reward": 0.8359375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1102 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.199298683221743e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.1691756272401435, "grad_norm": 0.06400873024854346, "learning_rate": 1.3248114927335906e-06, "loss": 0.0, "num_tokens": 92275691.0, "reward": 0.7890625, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1103 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.1292059034235823e-08, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.172043010752688, "grad_norm": 0.1423659523351042, "learning_rate": 1.3237470854531823e-06, "loss": 0.0, "num_tokens": 92359565.0, "reward": 0.828125, "reward_std": 0.18543371558189392, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1104 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907386946654693e-10, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.174910394265233, "grad_norm": 0.13585745299902, "learning_rate": 1.3226822682757743e-06, "loss": -0.0, "num_tokens": 92437625.0, "reward": 0.859375, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1105 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983629174425397e-09, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.1777777777777776, "grad_norm": 0.11995484554528156, "learning_rate": 1.321617042549535e-06, "loss": 0.0, "num_tokens": 92511972.0, "reward": 0.921875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1106 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.95844518607158e-10, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.1806451612903226, "grad_norm": 0.14138202171271355, "learning_rate": 1.320551409623149e-06, "loss": -0.0, "num_tokens": 92591448.0, "reward": 0.796875, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1107 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.4083929189798527e-09, "advantages/std": 0.6612661480903625, "advantages/var": 0.4372729186102653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.183512544802867, "grad_norm": 17.339898670927273, "learning_rate": 1.319485370845817e-06, "loss": 0.0, "num_tokens": 92675998.0, "reward": 0.8046875, "reward_std": 0.19833700358867645, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1108 }, { "advantages/mean": 6.51925802230835e-09, "advantages/snr": 1.3941923669503344e-08, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.186379928315412, "grad_norm": 0.11099719938959011, "learning_rate": 1.318418927567253e-06, "loss": 0.0, "num_tokens": 92755810.0, "reward": 0.78125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1109 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299811976811062e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.189247311827957, "grad_norm": 0.06748169995525603, "learning_rate": 1.3173520811376842e-06, "loss": -0.0, "num_tokens": 92845212.0, "reward": 0.6953125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1110 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2946288161655503e-08, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.1921146953405017, "grad_norm": 0.1729202957432041, "learning_rate": 1.3162848329078468e-06, "loss": -0.0, "num_tokens": 92931747.0, "reward": 0.8828125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1111 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125764130418491e-09, "advantages/std": 0.5227916836738586, "advantages/var": 0.2733111445185479, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 3.1949820788530467, "grad_norm": 0.11420128502253798, "learning_rate": 1.3152171842289869e-06, "loss": -0.0, "num_tokens": 93015564.0, "reward": 0.734375, "reward_std": 0.13994136452674866, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1112 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.1978494623655913, "grad_norm": 0.10964878776919543, "learning_rate": 1.3141491364528575e-06, "loss": 0.0, "num_tokens": 93105876.0, "reward": 0.7734375, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1113 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.2007168458781363, "grad_norm": 0.09936713674235224, "learning_rate": 1.3130806909317155e-06, "loss": 0.0, "num_tokens": 93178808.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1114 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.203584229390681, "grad_norm": 0.08818255744001925, "learning_rate": 1.3120118490183236e-06, "loss": -0.0, "num_tokens": 93260861.0, "reward": 0.890625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1115 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.206451612903226, "grad_norm": 0.1029823451744844, "learning_rate": 1.310942612065945e-06, "loss": 0.0, "num_tokens": 93339232.0, "reward": 0.8828125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1116 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.2093189964157705, "grad_norm": 0.10239693579696584, "learning_rate": 1.3098729814283425e-06, "loss": 0.0, "num_tokens": 93414681.0, "reward": 0.9375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1117 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344503462080032e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.2121863799283155, "grad_norm": 0.11581062256087042, "learning_rate": 1.308802958459779e-06, "loss": 0.0, "num_tokens": 93495167.0, "reward": 0.8984375, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1118 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814536252692192e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.21505376344086, "grad_norm": 0.14209019240601692, "learning_rate": 1.307732544515013e-06, "loss": -0.0, "num_tokens": 93573944.0, "reward": 0.8046875, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1119 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.217921146953405, "grad_norm": 0.0678371309109891, "learning_rate": 1.3066617409492982e-06, "loss": 0.0, "num_tokens": 93644153.0, "reward": 0.890625, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1120 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.22078853046595, "grad_norm": 0.11297932577816154, "learning_rate": 1.3055905491183821e-06, "loss": 0.0, "num_tokens": 93719671.0, "reward": 0.8359375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1121 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907427569709573e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.2236559139784946, "grad_norm": 0.14507820856849132, "learning_rate": 1.3045189703785023e-06, "loss": -0.0, "num_tokens": 93804920.0, "reward": 0.7109375, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1122 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.344496759136123e-09, "advantages/std": 0.5227746963500977, "advantages/var": 0.2732933831439368, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.2265232974910396, "grad_norm": 0.08312605561045118, "learning_rate": 1.3034470060863888e-06, "loss": 0.0, "num_tokens": 93876784.0, "reward": 0.890625, "reward_std": 0.12179599702358246, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1123 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.229390681003584, "grad_norm": 0.10769709358908111, "learning_rate": 1.302374657599257e-06, "loss": 0.0, "num_tokens": 93949699.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1124 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.232258064516129, "grad_norm": 0.05790644165104894, "learning_rate": 1.301301926274811e-06, "loss": 0.0, "num_tokens": 94019155.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1125 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299802498719973e-09, "advantages/std": 0.4049576222896576, "advantages/var": 0.16399067585049298, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.2351254480286737, "grad_norm": 0.1089893979447905, "learning_rate": 1.300228813471238e-06, "loss": 0.0, "num_tokens": 94096690.0, "reward": 0.9296875, "reward_std": 0.08891239762306213, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1126 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.2379928315412188, "grad_norm": 0.05358052660575321, "learning_rate": 1.299155320547209e-06, "loss": -0.0, "num_tokens": 94172287.0, "reward": 0.96875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1127 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599694025151775e-09, "advantages/std": 0.40494978427886963, "advantages/var": 0.16398432778750305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.2408602150537633, "grad_norm": 0.08993277827412993, "learning_rate": 1.2980814488618763e-06, "loss": -0.0, "num_tokens": 94254562.0, "reward": 0.8671875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1128 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.2437275985663083, "grad_norm": 0.03741420481122981, "learning_rate": 1.297007199774871e-06, "loss": -0.0, "num_tokens": 94322524.0, "reward": 0.8984375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1129 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 3.246594982078853, "grad_norm": 0.09576357623917453, "learning_rate": 1.2959325746463035e-06, "loss": 0.0, "num_tokens": 94402604.0, "reward": 0.796875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1130 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.449667444137735e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 3.249462365591398, "grad_norm": 0.05155173182885348, "learning_rate": 1.2948575748367584e-06, "loss": -0.0, "num_tokens": 94489118.0, "reward": 0.7578125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1131 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9834009234729675e-09, "advantages/std": 0.4676017165184021, "advantages/var": 0.21865136529095608, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.252329749103943, "grad_norm": 0.11833142962661158, "learning_rate": 1.2937822017072964e-06, "loss": 0.0, "num_tokens": 94565852.0, "reward": 0.8984375, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1132 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.2551971326164875, "grad_norm": 0.1250304564086963, "learning_rate": 1.2927064566194492e-06, "loss": 0.0, "num_tokens": 94645859.0, "reward": 0.875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1133 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.258064516129032, "grad_norm": 0.0, "learning_rate": 1.2916303409352214e-06, "loss": 0.0, "num_tokens": 94732887.0, "reward": 0.75, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1134 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.260931899641577, "grad_norm": 0.0983279558017045, "learning_rate": 1.2905538560170852e-06, "loss": 0.0, "num_tokens": 94814521.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1135 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907665222004876e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.263799283154122, "grad_norm": 0.10809583758934845, "learning_rate": 1.289477003227981e-06, "loss": 0.0, "num_tokens": 94906213.0, "reward": 0.8984375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1136 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.2666666666666666, "grad_norm": 0.0384556429318533, "learning_rate": 1.288399783931315e-06, "loss": -0.0, "num_tokens": 94978971.0, "reward": 0.8984375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1137 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983371727518391e-09, "advantages/std": 0.4676051437854767, "advantages/var": 0.21865457049463632, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.2695340501792116, "grad_norm": 0.12000069202772719, "learning_rate": 1.287322199490957e-06, "loss": -0.0, "num_tokens": 95062894.0, "reward": 0.8515625, "reward_std": 0.11784426867961884, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1138 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958849501312727e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.272401433691756, "grad_norm": 0.11544441417144902, "learning_rate": 1.2862442512712392e-06, "loss": 0.0, "num_tokens": 95142033.0, "reward": 0.890625, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1139 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998806953079044e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.275268817204301, "grad_norm": 0.12380391127673529, "learning_rate": 1.2851659406369551e-06, "loss": 0.0, "num_tokens": 95213204.0, "reward": 0.9453125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1140 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.2781362007168457, "grad_norm": 0.16194218053487433, "learning_rate": 1.284087268953356e-06, "loss": 0.0, "num_tokens": 95295811.0, "reward": 0.8359375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1141 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.2810035842293908, "grad_norm": 0.05095881919374362, "learning_rate": 1.2830082375861512e-06, "loss": -0.0, "num_tokens": 95373379.0, "reward": 0.90625, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1142 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5228040218353271, "advantages/var": 0.2733240452471932, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.2838709677419353, "grad_norm": 0.12303248397130558, "learning_rate": 1.2819288479015047e-06, "loss": 0.0, "num_tokens": 95462214.0, "reward": 0.75, "reward_std": 0.15466687083244324, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1143 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962665787340388e-09, "advantages/std": 0.4676010012626648, "advantages/var": 0.21865069638184664, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.2867383512544803, "grad_norm": 0.09910456100563501, "learning_rate": 1.280849101266035e-06, "loss": 0.0, "num_tokens": 95539641.0, "reward": 0.9375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1144 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 1.1950152502444473e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.289605734767025, "grad_norm": 0.08736872419534301, "learning_rate": 1.2797689990468112e-06, "loss": -0.0, "num_tokens": 95621771.0, "reward": 0.7421875, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1145 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.29247311827957, "grad_norm": 0.07349690490924235, "learning_rate": 1.2786885426113544e-06, "loss": 0.0, "num_tokens": 95708411.0, "reward": 0.8515625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1146 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.295340501792115, "grad_norm": 0.09362213075565909, "learning_rate": 1.2776077333276324e-06, "loss": 0.0, "num_tokens": 95787152.0, "reward": 0.6640625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 1147 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.2982078853046595, "grad_norm": 0.03775176697675935, "learning_rate": 1.276526572564061e-06, "loss": 0.0, "num_tokens": 95871995.0, "reward": 0.6796875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 1148 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022521435078091e-09, "advantages/std": 0.6185598969459534, "advantages/var": 0.38261634610978845, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.3010752688172045, "grad_norm": 0.17528959621414597, "learning_rate": 1.2754450616895005e-06, "loss": 0.0, "num_tokens": 95954564.0, "reward": 0.84375, "reward_std": 0.17623992264270782, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1149 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049657881259918, "advantages/var": 0.1639972895525057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.303942652329749, "grad_norm": 0.05405794177356671, "learning_rate": 1.2743632020732548e-06, "loss": -0.0, "num_tokens": 96041684.0, "reward": 0.7265625, "reward_std": 0.0946863517165184, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1150 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.306810035842294, "grad_norm": 0.0745366325775834, "learning_rate": 1.2732809950850683e-06, "loss": -0.0, "num_tokens": 96114505.0, "reward": 0.8046875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1151 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.3096774193548386, "grad_norm": 0.10078408790459324, "learning_rate": 1.2721984420951268e-06, "loss": 0.0, "num_tokens": 96190659.0, "reward": 0.9296875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1152 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.3125448028673836, "grad_norm": 0.07386438095189551, "learning_rate": 1.2711155444740526e-06, "loss": 0.0, "num_tokens": 96261026.0, "reward": 0.890625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1153 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.315412186379928, "grad_norm": 0.04510258925922734, "learning_rate": 1.2700323035929062e-06, "loss": 0.0, "num_tokens": 96337551.0, "reward": 0.765625, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1154 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.318279569892473, "grad_norm": 0.05165231576261721, "learning_rate": 1.2689487208231805e-06, "loss": -0.0, "num_tokens": 96414069.0, "reward": 0.828125, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1155 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562938529588136e-09, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.3211469534050178, "grad_norm": 0.1420159424855486, "learning_rate": 1.267864797536803e-06, "loss": -0.0, "num_tokens": 96490338.0, "reward": 0.7265625, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1156 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958683201273463e-10, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.3240143369175628, "grad_norm": 0.12135563806747647, "learning_rate": 1.2667805351061312e-06, "loss": -0.0, "num_tokens": 96570241.0, "reward": 0.6328125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 1157 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1498692618242365e-08, "advantages/std": 0.40496888756752014, "advantages/var": 0.16399979989767477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.3268817204301078, "grad_norm": 0.10155317067010135, "learning_rate": 1.265695934903953e-06, "loss": 0.0, "num_tokens": 96654085.0, "reward": 0.796875, "reward_std": 0.09810129553079605, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1158 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.3297491039426523, "grad_norm": 0.057359457438611275, "learning_rate": 1.2646109983034832e-06, "loss": 0.0, "num_tokens": 96736103.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1159 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.332616487455197, "grad_norm": 0.10192389899377412, "learning_rate": 1.263525726678363e-06, "loss": -0.0, "num_tokens": 96826847.0, "reward": 0.6796875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 1160 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.335483870967742, "grad_norm": 0.09375847790139395, "learning_rate": 1.2624401214026572e-06, "loss": 0.0, "num_tokens": 96904825.0, "reward": 0.78125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1161 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.75738695226396e-09, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.338351254480287, "grad_norm": 0.12074444773595139, "learning_rate": 1.2613541838508535e-06, "loss": 0.0, "num_tokens": 96982233.0, "reward": 0.8515625, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1162 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504965933612274e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.3412186379928315, "grad_norm": 0.10123460756262612, "learning_rate": 1.26026791539786e-06, "loss": -0.0, "num_tokens": 97068903.0, "reward": 0.75, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1163 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.3440860215053765, "grad_norm": 0.06230410181062095, "learning_rate": 1.2591813174190044e-06, "loss": -0.0, "num_tokens": 97154759.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1164 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.346953405017921, "grad_norm": 0.06771623444997626, "learning_rate": 1.2580943912900308e-06, "loss": 0.0, "num_tokens": 97228764.0, "reward": 0.84375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1165 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262475767256781e-09, "advantages/std": 0.5726819038391113, "advantages/var": 0.32796456298478915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.349820788530466, "grad_norm": 0.1288512683886992, "learning_rate": 1.2570071383870988e-06, "loss": 0.0, "num_tokens": 97324635.0, "reward": 0.5859375, "reward_std": 0.15756267309188843, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49449479579925537, "step": 1166 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185553073883057, "advantages/var": 0.3826106682982413, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.3526881720430106, "grad_norm": 0.1359847136820415, "learning_rate": 1.255919560086783e-06, "loss": 0.0, "num_tokens": 97405487.0, "reward": 0.7421875, "reward_std": 0.16834917664527893, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1167 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983628920516591e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.3555555555555556, "grad_norm": 0.08508583268728266, "learning_rate": 1.2548316577660676e-06, "loss": 0.0, "num_tokens": 97478621.0, "reward": 0.859375, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1168 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.8788747562343056e-09, "advantages/std": 0.5726664066314697, "advantages/var": 0.32794681328419983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.3584229390681, "grad_norm": 0.13029888196956377, "learning_rate": 1.25374343280235e-06, "loss": 0.0, "num_tokens": 97555214.0, "reward": 0.9140625, "reward_std": 0.13941732048988342, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1169 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.361290322580645, "grad_norm": 0.14925387660351838, "learning_rate": 1.2526548865734334e-06, "loss": 0.0, "num_tokens": 97644243.0, "reward": 0.8671875, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1170 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.3641577060931898, "grad_norm": 0.16881070944340243, "learning_rate": 1.2515660204575295e-06, "loss": 0.0, "num_tokens": 97713405.0, "reward": 0.890625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1171 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.3670250896057348, "grad_norm": 0.07447657569529018, "learning_rate": 1.2504768358332543e-06, "loss": -0.0, "num_tokens": 97794436.0, "reward": 0.8046875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1172 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.3698924731182798, "grad_norm": 0.11325414232721744, "learning_rate": 1.249387334079627e-06, "loss": 0.0, "num_tokens": 97877806.0, "reward": 0.8203125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1173 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967257841033182e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.3727598566308243, "grad_norm": 0.09120860078900074, "learning_rate": 1.2482975165760687e-06, "loss": -0.0, "num_tokens": 97955036.0, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1174 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343596286746e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.3756272401433693, "grad_norm": 0.10773030424525183, "learning_rate": 1.2472073847024004e-06, "loss": -0.0, "num_tokens": 98022372.0, "reward": 0.9140625, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1175 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344456541825744e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.378494623655914, "grad_norm": 0.15865280731250242, "learning_rate": 1.24611693983884e-06, "loss": 0.0, "num_tokens": 98095593.0, "reward": 0.8828125, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1176 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.381362007168459, "grad_norm": 0.08123097491556498, "learning_rate": 1.245026183366003e-06, "loss": 0.0, "num_tokens": 98170363.0, "reward": 0.96875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1177 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.8788209293635885e-09, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.3842293906810035, "grad_norm": 0.11612838837917279, "learning_rate": 1.2439351166648992e-06, "loss": -0.0, "num_tokens": 98258497.0, "reward": 0.765625, "reward_std": 0.1462521106004715, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1178 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.3870967741935485, "grad_norm": 0.13617074164091159, "learning_rate": 1.2428437411169303e-06, "loss": -0.0, "num_tokens": 98343751.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1179 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.389964157706093, "grad_norm": 0.07805308280038463, "learning_rate": 1.2417520581038901e-06, "loss": 0.0, "num_tokens": 98420827.0, "reward": 0.9296875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1180 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983628920516591e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.392831541218638, "grad_norm": 0.12308896762162003, "learning_rate": 1.2406600690079608e-06, "loss": 0.0, "num_tokens": 98503170.0, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1181 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.3956989247311826, "grad_norm": 0.08901455185337721, "learning_rate": 1.2395677752117126e-06, "loss": 0.0, "num_tokens": 98582303.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1182 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.131055087374661e-10, "advantages/std": 0.5726947784423828, "advantages/var": 0.32797930925516994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.3985663082437276, "grad_norm": 0.1257490696194946, "learning_rate": 1.2384751780981017e-06, "loss": -0.0, "num_tokens": 98670905.0, "reward": 0.6953125, "reward_std": 0.17464229464530945, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1183 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5050695213580615e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.4014336917562726, "grad_norm": 0.1286335029069769, "learning_rate": 1.2373822790504681e-06, "loss": 0.0, "num_tokens": 98755275.0, "reward": 0.8828125, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1184 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049627184867859, "advantages/var": 0.1639948033642078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.404301075268817, "grad_norm": 0.08057067854185818, "learning_rate": 1.236289079452534e-06, "loss": 0.0, "num_tokens": 98838254.0, "reward": 0.84375, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1185 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.4071684587813618, "grad_norm": 0.08605115730986439, "learning_rate": 1.2351955806884014e-06, "loss": 0.0, "num_tokens": 98914452.0, "reward": 0.8828125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1186 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.4100358422939068, "grad_norm": 0.026348135341604267, "learning_rate": 1.234101784142553e-06, "loss": 0.0, "num_tokens": 98990483.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1187 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5629027821475993e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.412903225806452, "grad_norm": 0.07819569401189805, "learning_rate": 1.2330076911998463e-06, "loss": -0.0, "num_tokens": 99083265.0, "reward": 0.84375, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1188 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.4157706093189963, "grad_norm": 0.09006369062795917, "learning_rate": 1.2319133032455162e-06, "loss": 0.0, "num_tokens": 99155919.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1189 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958683201273463e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.4186379928315414, "grad_norm": 0.10137794543828527, "learning_rate": 1.230818621665169e-06, "loss": 0.0, "num_tokens": 99241101.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1190 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.421505376344086, "grad_norm": 0.04979362374814982, "learning_rate": 1.2297236478447845e-06, "loss": -0.0, "num_tokens": 99325116.0, "reward": 0.8828125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1191 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950488391937882e-08, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.424372759856631, "grad_norm": 0.11878324664587483, "learning_rate": 1.2286283831707114e-06, "loss": 0.0, "num_tokens": 99396918.0, "reward": 0.828125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1192 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7815011540266774e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.4272401433691755, "grad_norm": 0.12176590117312114, "learning_rate": 1.2275328290296676e-06, "loss": 0.0, "num_tokens": 99477109.0, "reward": 0.8828125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1193 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899627360122966e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.4301075268817205, "grad_norm": 0.12056498969551692, "learning_rate": 1.2264369868087364e-06, "loss": -0.0, "num_tokens": 99551171.0, "reward": 0.828125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1194 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.1498780624502616e-09, "advantages/std": 0.4049657881259918, "advantages/var": 0.1639972895525057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.432974910394265, "grad_norm": 0.12740750311776367, "learning_rate": 1.2253408578953666e-06, "loss": -0.0, "num_tokens": 99630765.0, "reward": 0.8515625, "reward_std": 0.094686359167099, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1195 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199247907244247e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.43584229390681, "grad_norm": 0.08831406153890661, "learning_rate": 1.2242444436773695e-06, "loss": 0.0, "num_tokens": 99711000.0, "reward": 0.8203125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1196 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.1499403476539522e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.4387096774193546, "grad_norm": 11.03990611559521, "learning_rate": 1.2231477455429185e-06, "loss": 0.0, "num_tokens": 99784870.0, "reward": 0.9453125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1197 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016541313711486e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 3.4415770609318996, "grad_norm": 0.10249676734220825, "learning_rate": 1.2220507648805454e-06, "loss": 0.0, "num_tokens": 99867366.0, "reward": 0.6953125, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1198 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907386946654693e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.4444444444444446, "grad_norm": 0.14083166742954242, "learning_rate": 1.2209535030791402e-06, "loss": 0.0, "num_tokens": 99957869.0, "reward": 0.734375, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1199 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562923093105361e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.447311827956989, "grad_norm": 0.1092072729801531, "learning_rate": 1.219855961527949e-06, "loss": 0.0, "num_tokens": 100034471.0, "reward": 0.890625, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1200 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726664066314697, "advantages/var": 0.32794681328419983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.450179211469534, "grad_norm": 0.09284326648205744, "learning_rate": 1.218758141616572e-06, "loss": 0.0, "num_tokens": 100119571.0, "reward": 0.8203125, "reward_std": 0.13941732048988342, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1201 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958516906788102e-10, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.4530465949820788, "grad_norm": 0.09633772102145596, "learning_rate": 1.2176600447349615e-06, "loss": 0.0, "num_tokens": 100201105.0, "reward": 0.84375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1202 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.455913978494624, "grad_norm": 0.06799586748471112, "learning_rate": 1.216561672273421e-06, "loss": 0.0, "num_tokens": 100291427.0, "reward": 0.84375, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1203 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5629027821475993e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.4587813620071683, "grad_norm": 0.12981349922544916, "learning_rate": 1.2154630256226021e-06, "loss": -0.0, "num_tokens": 100375437.0, "reward": 0.796875, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1204 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.3009849269685278e-08, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.4616487455197134, "grad_norm": 0.14342295542203048, "learning_rate": 1.2143641061735045e-06, "loss": -0.0, "num_tokens": 100463182.0, "reward": 0.6796875, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 1205 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.464516129032258, "grad_norm": 0.11517897259066424, "learning_rate": 1.2132649153174732e-06, "loss": -0.0, "num_tokens": 100549042.0, "reward": 0.7890625, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1206 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96693437436781e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.467383512544803, "grad_norm": 0.09397773898658479, "learning_rate": 1.2121654544461958e-06, "loss": -0.0, "num_tokens": 100638467.0, "reward": 0.8125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1207 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.4251693997307513e-08, "advantages/std": 0.5227856040000916, "advantages/var": 0.27330478774974054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.4702508960573475, "grad_norm": 0.1519680821743502, "learning_rate": 1.2110657249517028e-06, "loss": -0.0, "num_tokens": 100732338.0, "reward": 0.75, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1208 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.439337344384284e-09, "advantages/std": 0.5726898908615112, "advantages/var": 0.32797371109496964, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.4731182795698925, "grad_norm": 0.10906049011686104, "learning_rate": 1.209965728226365e-06, "loss": 0.0, "num_tokens": 100816224.0, "reward": 0.671875, "reward_std": 0.16675156354904175, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1209 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.4759856630824375, "grad_norm": 0.1486381160895392, "learning_rate": 1.2088654656628898e-06, "loss": 0.0, "num_tokens": 100894492.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1210 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.478853046594982, "grad_norm": 0.1259585465878865, "learning_rate": 1.2077649386543236e-06, "loss": 0.0, "num_tokens": 100968092.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1211 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.481720430107527, "grad_norm": 0.14289506144752373, "learning_rate": 1.2066641485940456e-06, "loss": 0.0, "num_tokens": 101048864.0, "reward": 0.671875, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1212 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899540529955257e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.4845878136200716, "grad_norm": 0.08574260029696501, "learning_rate": 1.2055630968757695e-06, "loss": -0.0, "num_tokens": 101115472.0, "reward": 0.8515625, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1213 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966813525430481e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.4874551971326166, "grad_norm": 0.1146376448735616, "learning_rate": 1.2044617848935392e-06, "loss": 0.0, "num_tokens": 101190115.0, "reward": 0.9375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1214 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.449710856633628e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.490322580645161, "grad_norm": 0.08397441447009728, "learning_rate": 1.2033602140417287e-06, "loss": -0.0, "num_tokens": 101259449.0, "reward": 0.75, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1215 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.493189964157706, "grad_norm": 0.12488189256210573, "learning_rate": 1.2022583857150396e-06, "loss": 0.0, "num_tokens": 101338805.0, "reward": 0.9140625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1216 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962764040172268e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.496057347670251, "grad_norm": 0.15252039222909725, "learning_rate": 1.2011563013084996e-06, "loss": 0.0, "num_tokens": 101418877.0, "reward": 0.890625, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1217 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 8.944094503700182e-09, "advantages/std": 0.5726990103721619, "advantages/var": 0.32798415648125356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.498924731182796, "grad_norm": 0.14327275724747895, "learning_rate": 1.2000539622174607e-06, "loss": -0.0, "num_tokens": 101493230.0, "reward": 0.6015625, "reward_std": 0.17806214094161987, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 1218 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.5017921146953404, "grad_norm": 0.07537604610157368, "learning_rate": 1.1989513698375965e-06, "loss": 0.0, "num_tokens": 101576438.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1219 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.0655021560480435e-09, "advantages/std": 0.5726983547210693, "advantages/var": 0.32798340550021976, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.5046594982078854, "grad_norm": 0.13579145137213305, "learning_rate": 1.1978485255649032e-06, "loss": 0.0, "num_tokens": 101659133.0, "reward": 0.71875, "reward_std": 0.17700131237506866, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1220 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5049341132024195e-09, "advantages/std": 0.5726868510246277, "advantages/var": 0.3279702293365041, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.5075268817204304, "grad_norm": 0.12683903413547704, "learning_rate": 1.1967454307956932e-06, "loss": 0.0, "num_tokens": 101737395.0, "reward": 0.828125, "reward_std": 0.16545340418815613, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1221 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975166886896984e-09, "advantages/std": 0.4675965905189514, "advantages/var": 0.21864657146494793, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.510394265232975, "grad_norm": 0.09319276000313177, "learning_rate": 1.195642086926599e-06, "loss": 0.0, "num_tokens": 101813461.0, "reward": 0.7578125, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1222 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96694656101877e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.5132616487455195, "grad_norm": 0.10925697498476222, "learning_rate": 1.1945384953545658e-06, "loss": 0.0, "num_tokens": 101882370.0, "reward": 0.9140625, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1223 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958610843448447e-10, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.5161290322580645, "grad_norm": 0.1334681175867728, "learning_rate": 1.1934346574768547e-06, "loss": -0.0, "num_tokens": 101969857.0, "reward": 0.8671875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1224 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5628430692729714e-09, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.5189964157706095, "grad_norm": 0.17314748462184348, "learning_rate": 1.192330574691037e-06, "loss": -0.0, "num_tokens": 102052362.0, "reward": 0.7109375, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1225 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125782003796406e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.521863799283154, "grad_norm": 0.2015930023571842, "learning_rate": 1.191226248394995e-06, "loss": 0.0, "num_tokens": 102135090.0, "reward": 0.875, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1226 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.439373903985093e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.524731182795699, "grad_norm": 0.1360875524771212, "learning_rate": 1.1901216799869188e-06, "loss": -0.0, "num_tokens": 102221610.0, "reward": 0.8125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1227 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9875550720364307e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.5275985663082436, "grad_norm": 0.13875885976895325, "learning_rate": 1.1890168708653053e-06, "loss": 0.0, "num_tokens": 102298993.0, "reward": 0.859375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1228 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.672251731040016e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.5304659498207887, "grad_norm": 0.14595977161720528, "learning_rate": 1.1879118224289561e-06, "loss": 0.0, "num_tokens": 102390020.0, "reward": 0.8046875, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1229 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8994882297977766e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.533333333333333, "grad_norm": 0.1415508927459141, "learning_rate": 1.1868065360769758e-06, "loss": -0.0, "num_tokens": 102468791.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1230 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.5362007168458782, "grad_norm": 0.09199765758500333, "learning_rate": 1.1857010132087704e-06, "loss": 0.0, "num_tokens": 102545313.0, "reward": 0.8828125, "reward_std": 0.08443662524223328, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1231 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.749445740229558e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.539068100358423, "grad_norm": 0.07560841262380356, "learning_rate": 1.1845952552240448e-06, "loss": -0.0, "num_tokens": 102626121.0, "reward": 0.8515625, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1232 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.541935483870968, "grad_norm": 0.1342972638035452, "learning_rate": 1.1834892635228022e-06, "loss": 0.0, "num_tokens": 102712264.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1233 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814455009491016e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.5448028673835124, "grad_norm": 0.13968579486800936, "learning_rate": 1.1823830395053416e-06, "loss": -0.0, "num_tokens": 102789811.0, "reward": 0.8125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1234 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.5476702508960574, "grad_norm": 0.13576783036106513, "learning_rate": 1.1812765845722559e-06, "loss": 0.0, "num_tokens": 102868670.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1235 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.5505376344086024, "grad_norm": 0.131430380104203, "learning_rate": 1.1801699001244304e-06, "loss": 0.0, "num_tokens": 102951361.0, "reward": 0.8984375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1236 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.0349153895649778e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.553405017921147, "grad_norm": 0.0740815983049289, "learning_rate": 1.1790629875630412e-06, "loss": -0.0, "num_tokens": 103026292.0, "reward": 0.7421875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1237 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.5562724014336915, "grad_norm": 0.15201122837467138, "learning_rate": 1.1779558482895528e-06, "loss": -0.0, "num_tokens": 103109879.0, "reward": 0.84375, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1238 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814371735978847e-09, "advantages/std": 0.5227928161621094, "advantages/var": 0.2733123286307091, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.5591397849462365, "grad_norm": 0.142543365316104, "learning_rate": 1.1768484837057175e-06, "loss": 0.0, "num_tokens": 103195818.0, "reward": 0.7734375, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1239 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3444010916782605e-09, "advantages/std": 0.5227840542793274, "advantages/var": 0.27330316740873073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.5620071684587815, "grad_norm": 0.1421483267478702, "learning_rate": 1.1757408952135722e-06, "loss": 0.0, "num_tokens": 103277159.0, "reward": 0.734375, "reward_std": 0.13204573094844818, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1240 }, { "advantages/mean": -8.381903171539307e-09, "advantages/snr": 1.463624342391056e-08, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.564874551971326, "grad_norm": 0.11733338278671168, "learning_rate": 1.174633084215437e-06, "loss": 0.0, "num_tokens": 103359879.0, "reward": 0.921875, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1241 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971078240891425e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.567741935483871, "grad_norm": 0.0809665211104052, "learning_rate": 1.1735250521139148e-06, "loss": 0.0, "num_tokens": 103437878.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1242 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814134106090637e-09, "advantages/std": 0.5227997899055481, "advantages/var": 0.27331962032528523, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.5706093189964156, "grad_norm": 0.11570311984898715, "learning_rate": 1.1724168003118874e-06, "loss": -0.0, "num_tokens": 103524530.0, "reward": 0.765625, "reward_std": 0.14806944131851196, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1243 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.5734767025089607, "grad_norm": 0.1183125986099129, "learning_rate": 1.1713083302125158e-06, "loss": -0.0, "num_tokens": 103597931.0, "reward": 0.8203125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1244 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.576344086021505, "grad_norm": 0.1298317431898703, "learning_rate": 1.170199643219236e-06, "loss": 0.0, "num_tokens": 103688988.0, "reward": 0.7578125, "reward_std": 0.17859892547130585, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1245 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.5792114695340502, "grad_norm": 0.08847302241965387, "learning_rate": 1.16909074073576e-06, "loss": 0.0, "num_tokens": 103761600.0, "reward": 0.9765625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 1246 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199094228701277e-09, "advantages/std": 0.4049627184867859, "advantages/var": 0.1639948033642078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.5820788530465952, "grad_norm": 0.0733599792761376, "learning_rate": 1.1679816241660717e-06, "loss": -0.0, "num_tokens": 103844973.0, "reward": 0.5625, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49802759289741516, "step": 1247 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878747807970186e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.58494623655914, "grad_norm": 0.16772127285058927, "learning_rate": 1.1668722949144266e-06, "loss": 0.0, "num_tokens": 103930651.0, "reward": 0.875, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1248 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449710856633628e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.5878136200716844, "grad_norm": 0.12085872474781346, "learning_rate": 1.165762754385349e-06, "loss": 0.0, "num_tokens": 104005167.0, "reward": 0.8125, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1249 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599795920412534e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.5906810035842294, "grad_norm": 0.08844179403853, "learning_rate": 1.1646530039836311e-06, "loss": 0.0, "num_tokens": 104083834.0, "reward": 0.859375, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1250 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.252531375408196e-09, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.5935483870967744, "grad_norm": 0.12119886337231127, "learning_rate": 1.1635430451143307e-06, "loss": 0.0, "num_tokens": 104162273.0, "reward": 0.859375, "reward_std": 0.15072786808013916, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1251 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131418995429572e-09, "advantages/std": 0.5726691484451294, "advantages/var": 0.32794995358086965, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.596415770609319, "grad_norm": 0.1272812772182248, "learning_rate": 1.1624328791827696e-06, "loss": 0.0, "num_tokens": 104235263.0, "reward": 0.8828125, "reward_std": 0.14389309287071228, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1252 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.599283154121864, "grad_norm": 0.03398207896287452, "learning_rate": 1.1613225075945314e-06, "loss": 0.0, "num_tokens": 104321147.0, "reward": 0.6796875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 1253 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.6021505376344085, "grad_norm": 0.08432849289207688, "learning_rate": 1.1602119317554603e-06, "loss": 0.0, "num_tokens": 104396372.0, "reward": 0.953125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1254 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131246346616979e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.6050179211469535, "grad_norm": 0.12084258854824852, "learning_rate": 1.15910115307166e-06, "loss": 0.0, "num_tokens": 104476369.0, "reward": 0.78125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1255 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.607885304659498, "grad_norm": 0.08778679284878087, "learning_rate": 1.1579901729494889e-06, "loss": 0.0, "num_tokens": 104545444.0, "reward": 0.96875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1256 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.610752688172043, "grad_norm": 0.03071083150850874, "learning_rate": 1.156878992795563e-06, "loss": -0.0, "num_tokens": 104627475.0, "reward": 0.796875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1257 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 1.4082993428404723e-08, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.6136200716845877, "grad_norm": 0.08635758229878303, "learning_rate": 1.155767614016749e-06, "loss": -0.0, "num_tokens": 104711766.0, "reward": 0.671875, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1258 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.516765728025316e-09, "advantages/std": 0.6185770630836487, "advantages/var": 0.3826375829731923, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.6164874551971327, "grad_norm": 0.12554710188999404, "learning_rate": 1.1546560380201678e-06, "loss": 0.0, "num_tokens": 104805976.0, "reward": 0.75, "reward_std": 0.19568344950675964, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1259 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599467569791755e-09, "advantages/std": 0.404969722032547, "advantages/var": 0.16400047576311838, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.6193548387096772, "grad_norm": 0.08417377447220432, "learning_rate": 1.1535442662131873e-06, "loss": 0.0, "num_tokens": 104883446.0, "reward": 0.9140625, "reward_std": 0.09916213154792786, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1260 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899540529955257e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.6222222222222222, "grad_norm": 0.1296610194185722, "learning_rate": 1.1524323000034254e-06, "loss": -0.0, "num_tokens": 104972560.0, "reward": 0.8359375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1261 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.3942077395823529e-08, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.6250896057347672, "grad_norm": 0.0770382463476364, "learning_rate": 1.151320140798745e-06, "loss": 0.0, "num_tokens": 105053986.0, "reward": 0.8125, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1262 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.627956989247312, "grad_norm": 0.058208183836016815, "learning_rate": 1.1502077900072533e-06, "loss": 0.0, "num_tokens": 105124167.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1263 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.966813525430481e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.6308243727598564, "grad_norm": 0.11667705362885708, "learning_rate": 1.1490952490373012e-06, "loss": -0.0, "num_tokens": 105209877.0, "reward": 0.828125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1264 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.5167470134271535e-09, "advantages/std": 0.618579626083374, "advantages/var": 0.3826407538054468, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 3.6336917562724014, "grad_norm": 0.1565468928481836, "learning_rate": 1.147982519297479e-06, "loss": 0.0, "num_tokens": 105297314.0, "reward": 0.765625, "reward_std": 0.2001592218875885, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1265 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.816724861393605e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.6365591397849464, "grad_norm": 0.12059672816300006, "learning_rate": 1.1468696021966171e-06, "loss": -0.0, "num_tokens": 105365104.0, "reward": 0.953125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1266 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.639426523297491, "grad_norm": 0.11550535544705419, "learning_rate": 1.1457564991437823e-06, "loss": -0.0, "num_tokens": 105442599.0, "reward": 0.8125, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1267 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.642293906810036, "grad_norm": 0.08574098513112333, "learning_rate": 1.1446432115482772e-06, "loss": 0.0, "num_tokens": 105514011.0, "reward": 0.9453125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1268 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.6451612903225805, "grad_norm": 0.05984439476964636, "learning_rate": 1.143529740819638e-06, "loss": 0.0, "num_tokens": 105593332.0, "reward": 0.875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1269 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.6480286738351255, "grad_norm": 0.11318223010227013, "learning_rate": 1.1424160883676332e-06, "loss": 0.0, "num_tokens": 105674573.0, "reward": 0.7578125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1270 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.505601632872161e-09, "advantages/std": 0.6185716986656189, "advantages/var": 0.3826309463900692, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.65089605734767, "grad_norm": 0.18279507068056253, "learning_rate": 1.1413022556022606e-06, "loss": 0.0, "num_tokens": 105756016.0, "reward": 0.828125, "reward_std": 0.18990948796272278, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1271 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299875955969384e-09, "advantages/std": 0.40494468808174133, "advantages/var": 0.16398020040561878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.653763440860215, "grad_norm": 0.07169495289737582, "learning_rate": 1.1401882439337464e-06, "loss": -0.0, "num_tokens": 105835971.0, "reward": 0.890625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1272 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.3009992800507722e-08, "advantages/std": 0.5726813673973083, "advantages/var": 0.32796394856405087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.65663082437276, "grad_norm": 0.13847441778441305, "learning_rate": 1.1390740547725442e-06, "loss": 0.0, "num_tokens": 105919242.0, "reward": 0.75, "reward_std": 0.15650182962417603, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1273 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599512249801046e-09, "advantages/std": 0.4049657881259918, "advantages/var": 0.1639972895525057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.6594982078853047, "grad_norm": 0.12531312744111278, "learning_rate": 1.1379596895293314e-06, "loss": -0.0, "num_tokens": 105994890.0, "reward": 0.9140625, "reward_std": 0.0946863517165184, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1274 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.6623655913978492, "grad_norm": 0.15310330840994021, "learning_rate": 1.1368451496150087e-06, "loss": 0.0, "num_tokens": 106074174.0, "reward": 0.7265625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1275 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.6652329749103942, "grad_norm": 0.1268705759404372, "learning_rate": 1.1357304364406978e-06, "loss": -0.0, "num_tokens": 106168247.0, "reward": 0.90625, "reward_std": 0.13098980486392975, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1276 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.6681003584229392, "grad_norm": 0.08277826866888821, "learning_rate": 1.1346155514177398e-06, "loss": 0.0, "num_tokens": 106237965.0, "reward": 0.8359375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1277 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.670967741935484, "grad_norm": 0.1021870065331042, "learning_rate": 1.1335004959576932e-06, "loss": 0.0, "num_tokens": 106318420.0, "reward": 0.5078125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.5078125, "rewards/drgrpo_math_reward/std": 0.5019033551216125, "step": 1278 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.673835125448029, "grad_norm": 0.018836866369400712, "learning_rate": 1.1323852714723335e-06, "loss": -0.0, "num_tokens": 106397754.0, "reward": 0.796875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1279 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.6767025089605734, "grad_norm": 0.14637770234607897, "learning_rate": 1.131269879373648e-06, "loss": -0.0, "num_tokens": 106479388.0, "reward": 0.8203125, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1280 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.4083760350291931e-09, "advantages/std": 0.6612740755081177, "advantages/var": 0.4372834029391157, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.6795698924731184, "grad_norm": 0.16404051028339522, "learning_rate": 1.1301543210738382e-06, "loss": 0.0, "num_tokens": 106552798.0, "reward": 0.75, "reward_std": 0.20964756608009338, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1281 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814773893309386e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.682437275985663, "grad_norm": 0.17002361057183865, "learning_rate": 1.1290385979853151e-06, "loss": -0.0, "num_tokens": 106627951.0, "reward": 0.859375, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1282 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.685304659498208, "grad_norm": 0.1620477574415513, "learning_rate": 1.1279227115206986e-06, "loss": -0.0, "num_tokens": 106704949.0, "reward": 0.9375, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1283 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.033618501892656e-09, "advantages/std": 0.6185711026191711, "advantages/var": 0.38263020899549716, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.688172043010753, "grad_norm": 0.15870349587849134, "learning_rate": 1.126806663092815e-06, "loss": 0.0, "num_tokens": 106798802.0, "reward": 0.7109375, "reward_std": 0.18884867429733276, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1284 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344407185044788e-09, "advantages/std": 0.5227834582328796, "advantages/var": 0.273302544201929, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.6910394265232975, "grad_norm": 0.1153759937149323, "learning_rate": 1.1256904541146965e-06, "loss": -0.0, "num_tokens": 106888194.0, "reward": 0.7109375, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1285 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970882307601416e-09, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.693906810035842, "grad_norm": 0.09479971517786234, "learning_rate": 1.124574085999578e-06, "loss": 0.0, "num_tokens": 106966067.0, "reward": 0.84375, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1286 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.696774193548387, "grad_norm": 0.0636527001797721, "learning_rate": 1.1234575601608955e-06, "loss": -0.0, "num_tokens": 107047911.0, "reward": 0.9609375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1287 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9834382441424445e-09, "advantages/std": 0.4675973355770111, "advantages/var": 0.21864726823871994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.699641577060932, "grad_norm": 0.15409206568396186, "learning_rate": 1.1223408780122859e-06, "loss": 0.0, "num_tokens": 107127238.0, "reward": 0.921875, "reward_std": 0.11100947856903076, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1288 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.7025089605734767, "grad_norm": 0.081940982852503, "learning_rate": 1.1212240409675824e-06, "loss": 0.0, "num_tokens": 107211703.0, "reward": 0.859375, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1289 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.7053763440860212, "grad_norm": 0.07718402860658599, "learning_rate": 1.120107050440816e-06, "loss": -0.0, "num_tokens": 107297379.0, "reward": 0.8828125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1290 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299829409932592e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.7082437275985662, "grad_norm": 0.16432220896658642, "learning_rate": 1.1189899078462106e-06, "loss": 0.0, "num_tokens": 107369068.0, "reward": 0.859375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1291 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016360347740786e-09, "advantages/std": 0.5227997899055481, "advantages/var": 0.27331962032528523, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.7111111111111112, "grad_norm": 0.1053941974370414, "learning_rate": 1.117872614598184e-06, "loss": 0.0, "num_tokens": 107457753.0, "reward": 0.765625, "reward_std": 0.14806942641735077, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1292 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962665216109293e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.713978494623656, "grad_norm": 0.11531907034017322, "learning_rate": 1.1167551721113434e-06, "loss": -0.0, "num_tokens": 107536743.0, "reward": 0.859375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1293 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065623173308489e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.716845878136201, "grad_norm": 0.09979824294231059, "learning_rate": 1.1156375818004855e-06, "loss": 0.0, "num_tokens": 107627363.0, "reward": 0.84375, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1294 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.528079978419126e-09, "advantages/std": 0.6185657978057861, "advantages/var": 0.3826236462151087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.7197132616487454, "grad_norm": 0.10167525036542162, "learning_rate": 1.1145198450805945e-06, "loss": 0.0, "num_tokens": 107715699.0, "reward": 0.8515625, "reward_std": 0.1830747127532959, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1295 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967079601050182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.7225806451612904, "grad_norm": 0.17046357996494574, "learning_rate": 1.1134019633668396e-06, "loss": 0.0, "num_tokens": 107788985.0, "reward": 0.890625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1296 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.725448028673835, "grad_norm": 0.08946783404091041, "learning_rate": 1.1122839380745737e-06, "loss": 0.0, "num_tokens": 107871812.0, "reward": 0.7421875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1297 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.3361080419982039e-08, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.72831541218638, "grad_norm": 0.09614758200353986, "learning_rate": 1.1111657706193312e-06, "loss": 0.0, "num_tokens": 107947065.0, "reward": 0.875, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1298 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.672108843924502e-09, "advantages/std": 0.5228019952774048, "advantages/var": 0.2733219262660356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.731182795698925, "grad_norm": 0.1333434622161509, "learning_rate": 1.1100474624168268e-06, "loss": -0.0, "num_tokens": 108035921.0, "reward": 0.7421875, "reward_std": 0.14807432889938354, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1299 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.7340501792114695, "grad_norm": 0.03215331874989343, "learning_rate": 1.1089290148829536e-06, "loss": 0.0, "num_tokens": 108113233.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1300 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967257841033182e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.736917562724014, "grad_norm": 0.09556214212903666, "learning_rate": 1.1078104294337804e-06, "loss": 0.0, "num_tokens": 108201219.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1301 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983400669593257e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.739784946236559, "grad_norm": 0.09208740991565705, "learning_rate": 1.1066917074855515e-06, "loss": -0.0, "num_tokens": 108271266.0, "reward": 0.9296875, "reward_std": 0.11336850374937057, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1302 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344329800322181e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.742652329749104, "grad_norm": 0.13102744513708434, "learning_rate": 1.1055728504546833e-06, "loss": 0.0, "num_tokens": 108349729.0, "reward": 0.8515625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1303 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983450684521008e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 3.7455197132616487, "grad_norm": 0.10207573847644788, "learning_rate": 1.1044538597577637e-06, "loss": 0.0, "num_tokens": 108444578.0, "reward": 0.796875, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1304 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691872442631884e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.7483870967741937, "grad_norm": 0.10111644770835693, "learning_rate": 1.1033347368115494e-06, "loss": -0.0, "num_tokens": 108524385.0, "reward": 0.859375, "reward_std": 0.15650182962417603, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1305 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1498692618242365e-08, "advantages/std": 0.40496888756752014, "advantages/var": 0.16399979989767477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.7512544802867382, "grad_norm": 0.1064704614620527, "learning_rate": 1.1022154830329648e-06, "loss": 0.0, "num_tokens": 108616264.0, "reward": 0.671875, "reward_std": 0.09810129553079605, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1306 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.7541218637992833, "grad_norm": 0.09424157843515399, "learning_rate": 1.1010960998391001e-06, "loss": -0.0, "num_tokens": 108703232.0, "reward": 0.65625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 1307 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.756989247311828, "grad_norm": 0.06870232361927221, "learning_rate": 1.099976588647209e-06, "loss": 0.0, "num_tokens": 108768091.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1308 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.759856630824373, "grad_norm": 0.13151750578685634, "learning_rate": 1.0988569508747075e-06, "loss": -0.0, "num_tokens": 108847049.0, "reward": 0.8203125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1309 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.762724014336918, "grad_norm": 0.05392749330554253, "learning_rate": 1.0977371879391721e-06, "loss": 0.0, "num_tokens": 108917659.0, "reward": 0.828125, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1310 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.618579626083374, "advantages/var": 0.3826407538054468, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.7655913978494624, "grad_norm": 0.12948372587088572, "learning_rate": 1.0966173012583366e-06, "loss": 0.0, "num_tokens": 109013921.0, "reward": 0.734375, "reward_std": 0.2001592218875885, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1311 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.768458781362007, "grad_norm": 0.06999483842741774, "learning_rate": 1.0954972922500935e-06, "loss": -0.0, "num_tokens": 109095568.0, "reward": 0.9609375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1312 }, { "advantages/mean": 6.984919309616089e-09, "advantages/snr": 1.336040712984824e-08, "advantages/std": 0.5228073596954346, "advantages/var": 0.2733275353517115, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.771326164874552, "grad_norm": 0.11859180878926806, "learning_rate": 1.0943771623324882e-06, "loss": -0.0, "num_tokens": 109174541.0, "reward": 0.734375, "reward_std": 0.15596505999565125, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1313 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.774193548387097, "grad_norm": 0.03892614703730164, "learning_rate": 1.0932569129237205e-06, "loss": 0.0, "num_tokens": 109256372.0, "reward": 0.9140625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1314 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.7770609318996415, "grad_norm": 0.11123047373003779, "learning_rate": 1.0921365454421402e-06, "loss": 0.0, "num_tokens": 109339020.0, "reward": 0.9140625, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1315 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.504997077293582e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 3.7799283154121865, "grad_norm": 0.12202431275931674, "learning_rate": 1.0910160613062487e-06, "loss": -0.0, "num_tokens": 109426843.0, "reward": 0.796875, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1316 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.782795698924731, "grad_norm": 0.10100155038741537, "learning_rate": 1.0898954619346923e-06, "loss": 0.0, "num_tokens": 109509983.0, "reward": 0.6875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 1317 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907665222004876e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.785663082437276, "grad_norm": 0.10900857756454012, "learning_rate": 1.088774748746266e-06, "loss": 0.0, "num_tokens": 109585662.0, "reward": 0.9609375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1318 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.7885304659498207, "grad_norm": 0.05121812077394973, "learning_rate": 1.0876539231599067e-06, "loss": 0.0, "num_tokens": 109663710.0, "reward": 0.953125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1319 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.011193400505165e-09, "advantages/std": 0.6185737252235413, "advantages/var": 0.3826334535369291, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.7913978494623657, "grad_norm": 0.13595044286737032, "learning_rate": 1.0865329865946945e-06, "loss": 0.0, "num_tokens": 109744234.0, "reward": 0.8671875, "reward_std": 0.19332444667816162, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1320 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.4083577551179745e-09, "advantages/std": 0.6612826585769653, "advantages/var": 0.4372947545346193, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.7942652329749103, "grad_norm": 0.15701328467704398, "learning_rate": 1.085411940469851e-06, "loss": 0.0, "num_tokens": 109825414.0, "reward": 0.8125, "reward_std": 0.21884137392044067, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1321 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516833624148468e-09, "advantages/std": 0.6185677647590637, "advantages/var": 0.3826260795990244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.7971326164874553, "grad_norm": 0.11017406656209758, "learning_rate": 1.0842907862047342e-06, "loss": -0.0, "num_tokens": 109916589.0, "reward": 0.671875, "reward_std": 0.18648964166641235, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1322 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016494700483271e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.8, "grad_norm": 0.14760198313667855, "learning_rate": 1.0831695252188413e-06, "loss": 0.0, "num_tokens": 109994855.0, "reward": 0.7421875, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1323 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.802867383512545, "grad_norm": 0.1477807109136703, "learning_rate": 1.0820481589318031e-06, "loss": 0.0, "num_tokens": 110077011.0, "reward": 0.8984375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1324 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.80573476702509, "grad_norm": 0.10275246143222694, "learning_rate": 1.0809266887633848e-06, "loss": 0.0, "num_tokens": 110161437.0, "reward": 0.78125, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1325 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899540529955257e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.8086021505376344, "grad_norm": 0.09403150692646986, "learning_rate": 1.0798051161334817e-06, "loss": -0.0, "num_tokens": 110243698.0, "reward": 0.7890625, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1326 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.811469534050179, "grad_norm": 0.1064862118449541, "learning_rate": 1.0786834424621209e-06, "loss": 0.0, "num_tokens": 110311106.0, "reward": 0.8828125, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1327 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.4536081669351505e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 3.814336917562724, "grad_norm": 0.10207951572882269, "learning_rate": 1.0775616691694553e-06, "loss": 0.0, "num_tokens": 110393058.0, "reward": 0.7578125, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1328 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749445740229558e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.817204301075269, "grad_norm": 0.13002847818344113, "learning_rate": 1.0764397976757655e-06, "loss": 0.0, "num_tokens": 110474665.0, "reward": 0.7109375, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1329 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.8200716845878135, "grad_norm": 0.09621453531039598, "learning_rate": 1.0753178294014556e-06, "loss": 0.0, "num_tokens": 110555789.0, "reward": 0.9140625, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1330 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.8229390681003586, "grad_norm": 0.08820674061444314, "learning_rate": 1.074195765767052e-06, "loss": 0.0, "num_tokens": 110641561.0, "reward": 0.78125, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1331 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.4536934733273465e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.825806451612903, "grad_norm": 0.10667133981075559, "learning_rate": 1.073073608193203e-06, "loss": 0.0, "num_tokens": 110729635.0, "reward": 0.640625, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.481702595949173, "step": 1332 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.828673835125448, "grad_norm": 0.12307235433211015, "learning_rate": 1.071951358100675e-06, "loss": 0.0, "num_tokens": 110806669.0, "reward": 0.8203125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1333 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.8315412186379927, "grad_norm": 0.06152352254396921, "learning_rate": 1.0708290169103514e-06, "loss": -0.0, "num_tokens": 110883099.0, "reward": 0.8046875, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1334 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349232344696665e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.8344086021505377, "grad_norm": 0.07339730279981847, "learning_rate": 1.0697065860432314e-06, "loss": 0.0, "num_tokens": 110964489.0, "reward": 0.890625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1335 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983562397524497e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.8372759856630827, "grad_norm": 0.15370363082688193, "learning_rate": 1.0685840669204271e-06, "loss": 0.0, "num_tokens": 111043042.0, "reward": 0.8203125, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1336 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814513910737996e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.8401433691756273, "grad_norm": 0.09558394208295336, "learning_rate": 1.0674614609631634e-06, "loss": -0.0, "num_tokens": 111123683.0, "reward": 0.828125, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1337 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.843010752688172, "grad_norm": 0.12520949146235022, "learning_rate": 1.0663387695927742e-06, "loss": -0.0, "num_tokens": 111201382.0, "reward": 0.7578125, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1338 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.1579757501173404e-08, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.845878136200717, "grad_norm": 0.17554273424862962, "learning_rate": 1.065215994230702e-06, "loss": 0.0, "num_tokens": 111273465.0, "reward": 0.9453125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1339 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.848745519713262, "grad_norm": 0.06860011960572143, "learning_rate": 1.0640931362984955e-06, "loss": -0.0, "num_tokens": 111353193.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1340 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131077090192316e-09, "advantages/std": 0.5726932287216187, "advantages/var": 0.3279775342235922, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.8516129032258064, "grad_norm": 0.11329417129773361, "learning_rate": 1.0629701972178078e-06, "loss": 0.0, "num_tokens": 111436228.0, "reward": 0.7578125, "reward_std": 0.172288179397583, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1341 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.98750518535691e-09, "advantages/std": 0.46760883927345276, "advantages/var": 0.21865802656666578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.8544802867383514, "grad_norm": 0.09480214205734269, "learning_rate": 1.061847178410395e-06, "loss": -0.0, "num_tokens": 111524936.0, "reward": 0.8828125, "reward_std": 0.11914245784282684, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1342 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.29975138607539e-09, "advantages/std": 0.4049666225910187, "advantages/var": 0.16399796541277656, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.857347670250896, "grad_norm": 0.13440261245254936, "learning_rate": 1.0607240812981144e-06, "loss": -0.0, "num_tokens": 111600908.0, "reward": 0.921875, "reward_std": 0.09574718773365021, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1343 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.4948445614343708e-08, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.860215053763441, "grad_norm": 0.08384584122117245, "learning_rate": 1.059600907302921e-06, "loss": 0.0, "num_tokens": 111687198.0, "reward": 0.6875, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 1344 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.966859224177393e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.8630824372759855, "grad_norm": 0.14578138902811386, "learning_rate": 1.0584776578468697e-06, "loss": 0.0, "num_tokens": 111761184.0, "reward": 0.6171875, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 1345 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899421713267256e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.8659498207885306, "grad_norm": 0.09092611235150355, "learning_rate": 1.0573543343521082e-06, "loss": 0.0, "num_tokens": 111839818.0, "reward": 0.9375, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1346 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9833176526633854e-09, "advantages/std": 0.46761149168014526, "advantages/var": 0.21866050715133056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 3.868817204301075, "grad_norm": 0.08858353776566967, "learning_rate": 1.0562309382408798e-06, "loss": -0.0, "num_tokens": 111924067.0, "reward": 0.875, "reward_std": 0.12255740165710449, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1347 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.041931588438948e-09, "advantages/std": 0.6612692475318909, "advantages/var": 0.43727701773139316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.87168458781362, "grad_norm": 0.1840637119804573, "learning_rate": 1.055107470935519e-06, "loss": 0.0, "num_tokens": 112010302.0, "reward": 0.78125, "reward_std": 0.20069600641727448, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1348 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6612985134124756, "advantages/var": 0.43731572384155015, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.8745519713261647, "grad_norm": 0.1370269768902454, "learning_rate": 1.0539839338584508e-06, "loss": 0.0, "num_tokens": 112108864.0, "reward": 0.6875, "reward_std": 0.24146251380443573, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 1349 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.8774193548387097, "grad_norm": 0.04320776006081769, "learning_rate": 1.0528603284321878e-06, "loss": 0.0, "num_tokens": 112185742.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1350 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.3009750424828314e-08, "advantages/std": 0.5726920366287231, "advantages/var": 0.32797616881795477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.8802867383512547, "grad_norm": 0.1255881114221988, "learning_rate": 1.0517366560793304e-06, "loss": -0.0, "num_tokens": 112270299.0, "reward": 0.7890625, "reward_std": 0.1701665222644806, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1351 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983384167481491e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.8831541218637993, "grad_norm": 0.16555960495596858, "learning_rate": 1.0506129182225626e-06, "loss": 0.0, "num_tokens": 112351127.0, "reward": 0.8359375, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1352 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599694025151775e-09, "advantages/std": 0.40494978427886963, "advantages/var": 0.16398432778750305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.886021505376344, "grad_norm": 0.08557565482351023, "learning_rate": 1.0494891162846513e-06, "loss": 0.0, "num_tokens": 112424515.0, "reward": 0.8203125, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1353 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.888888888888889, "grad_norm": 0.09457011403274432, "learning_rate": 1.0483652516884458e-06, "loss": 0.0, "num_tokens": 112498953.0, "reward": 0.8515625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1354 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962814881146116e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.891756272401434, "grad_norm": 0.1745861528850439, "learning_rate": 1.0472413258568733e-06, "loss": -0.0, "num_tokens": 112571392.0, "reward": 0.8984375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1355 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.8946236559139784, "grad_norm": 0.07075151530037413, "learning_rate": 1.0461173402129393e-06, "loss": -0.0, "num_tokens": 112650111.0, "reward": 0.9375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1356 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.8974910394265234, "grad_norm": 0.1079626188074426, "learning_rate": 1.0449932961797247e-06, "loss": 0.0, "num_tokens": 112715656.0, "reward": 0.890625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1357 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.703125, "epoch": 3.900358422939068, "grad_norm": 0.07701497381093976, "learning_rate": 1.0438691951803848e-06, "loss": -0.0, "num_tokens": 112802452.0, "reward": 0.78125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1358 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 3.903225806451613, "grad_norm": 0.06111207006774216, "learning_rate": 1.0427450386381462e-06, "loss": -0.0, "num_tokens": 112885542.0, "reward": 0.734375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1359 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.9060931899641576, "grad_norm": 0.07875273671794938, "learning_rate": 1.0416208279763073e-06, "loss": 0.0, "num_tokens": 112961615.0, "reward": 0.7890625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1360 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.9089605734767026, "grad_norm": 0.08032362493131201, "learning_rate": 1.0404965646182329e-06, "loss": -0.0, "num_tokens": 113046939.0, "reward": 0.7109375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1361 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962814881146116e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.9118279569892476, "grad_norm": 0.1297952360056973, "learning_rate": 1.0393722499873562e-06, "loss": 0.0, "num_tokens": 113134836.0, "reward": 0.6953125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1362 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235142426779239e-09, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.914695340501792, "grad_norm": 0.15569561366482432, "learning_rate": 1.038247885507175e-06, "loss": 0.0, "num_tokens": 113226761.0, "reward": 0.8359375, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1363 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379887186086637e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.9175627240143367, "grad_norm": 0.11041596653421341, "learning_rate": 1.0371234726012496e-06, "loss": 0.0, "num_tokens": 113294239.0, "reward": 0.9453125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1364 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.9204301075268817, "grad_norm": 0.05127564834307841, "learning_rate": 1.0359990126932022e-06, "loss": 0.0, "num_tokens": 113376874.0, "reward": 0.8203125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1365 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.9232974910394267, "grad_norm": 0.09498261488724016, "learning_rate": 1.0348745072067141e-06, "loss": 0.0, "num_tokens": 113455819.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1366 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599795920412534e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 3.9261648745519713, "grad_norm": 0.0646405995256052, "learning_rate": 1.033749957565525e-06, "loss": 0.0, "num_tokens": 113543974.0, "reward": 0.859375, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1367 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.9290322580645163, "grad_norm": 0.1435436920010314, "learning_rate": 1.0326253651934294e-06, "loss": 0.0, "num_tokens": 113609869.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1368 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016351208262037e-09, "advantages/std": 0.5228003859519958, "advantages/var": 0.2733202435515558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.931899641577061, "grad_norm": 0.1259963606265134, "learning_rate": 1.031500731514277e-06, "loss": 0.0, "num_tokens": 113704248.0, "reward": 0.7109375, "reward_std": 0.14913026988506317, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1369 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450174584180815e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.934767025089606, "grad_norm": 0.05836523933563015, "learning_rate": 1.030376057951969e-06, "loss": -0.0, "num_tokens": 113774762.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1370 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 3.9376344086021504, "grad_norm": 0.04136085261347344, "learning_rate": 1.029251345930458e-06, "loss": -0.0, "num_tokens": 113853475.0, "reward": 0.9140625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1371 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.041743076701556e-10, "advantages/std": 0.6612869501113892, "advantages/var": 0.4373004303876229, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.9405017921146954, "grad_norm": 0.2235437844824043, "learning_rate": 1.028126596873744e-06, "loss": 0.0, "num_tokens": 113945853.0, "reward": 0.7109375, "reward_std": 0.22673210501670837, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1372 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.94336917562724, "grad_norm": 0.06286535968371615, "learning_rate": 1.0270018122058753e-06, "loss": 0.0, "num_tokens": 114013504.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1373 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262188031035393e-09, "advantages/std": 0.5726920366287231, "advantages/var": 0.32797616881795477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 3.946236559139785, "grad_norm": 0.11232914675015847, "learning_rate": 1.0258769933509438e-06, "loss": 0.0, "num_tokens": 114101359.0, "reward": 0.8046875, "reward_std": 0.1701665222644806, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1374 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.9491039426523296, "grad_norm": 0.11737911669960818, "learning_rate": 1.0247521417330863e-06, "loss": 0.0, "num_tokens": 114176210.0, "reward": 0.7890625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1375 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975101004389886e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.9519713261648746, "grad_norm": 0.0874895004538831, "learning_rate": 1.0236272587764798e-06, "loss": -0.0, "num_tokens": 114244963.0, "reward": 0.8203125, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1376 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983629174425397e-09, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.9548387096774196, "grad_norm": 0.11009512079189243, "learning_rate": 1.0225023459053415e-06, "loss": 0.0, "num_tokens": 114330630.0, "reward": 0.734375, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1377 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262492693233955e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.957706093189964, "grad_norm": 0.1347477292006464, "learning_rate": 1.0213774045439265e-06, "loss": -0.0, "num_tokens": 114409319.0, "reward": 0.75, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1378 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.9605734767025087, "grad_norm": 0.10334727924039125, "learning_rate": 1.0202524361165255e-06, "loss": 0.0, "num_tokens": 114481299.0, "reward": 0.8125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1379 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.9634408602150537, "grad_norm": 0.09093757904017581, "learning_rate": 1.0191274420474647e-06, "loss": 0.0, "num_tokens": 114560714.0, "reward": 0.8515625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1380 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.9663082437275987, "grad_norm": 0.08678117670580014, "learning_rate": 1.0180024237611009e-06, "loss": 0.0, "num_tokens": 114632344.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1381 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 3.9691756272401433, "grad_norm": 0.07109617726796713, "learning_rate": 1.0168773826818235e-06, "loss": -0.0, "num_tokens": 114704697.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1382 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.9720430107526883, "grad_norm": 0.07477735095983636, "learning_rate": 1.015752320234049e-06, "loss": -0.0, "num_tokens": 114780505.0, "reward": 0.9375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1383 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131056779894869e-09, "advantages/std": 0.5726946592330933, "advantages/var": 0.3279791727141088, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 3.974910394265233, "grad_norm": 0.10464202287838072, "learning_rate": 1.0146272378422227e-06, "loss": -0.0, "num_tokens": 114878778.0, "reward": 0.7734375, "reward_std": 0.17123225331306458, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1384 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.977777777777778, "grad_norm": 0.08964071801741949, "learning_rate": 1.0135021369308136e-06, "loss": 0.0, "num_tokens": 114944000.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1385 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.9806451612903224, "grad_norm": 0.029530648555468466, "learning_rate": 1.0123770189243149e-06, "loss": -0.0, "num_tokens": 115012550.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1386 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.672120115913595e-09, "advantages/std": 0.5227997899055481, "advantages/var": 0.27331962032528523, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 3.9835125448028674, "grad_norm": 0.13306170651545898, "learning_rate": 1.0112518852472413e-06, "loss": -0.0, "num_tokens": 115102429.0, "reward": 0.828125, "reward_std": 0.14806944131851196, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1387 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.4082993428404723e-08, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 3.9863799283154124, "grad_norm": 0.050078905854352956, "learning_rate": 1.0101267373241277e-06, "loss": 0.0, "num_tokens": 115166656.0, "reward": 0.890625, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1388 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.989247311827957, "grad_norm": 0.05174973349172179, "learning_rate": 1.0090015765795264e-06, "loss": -0.0, "num_tokens": 115240570.0, "reward": 0.828125, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1389 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 3.9921146953405016, "grad_norm": 0.09582298486917787, "learning_rate": 1.0078764044380063e-06, "loss": -0.0, "num_tokens": 115319665.0, "reward": 0.921875, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1390 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 3.9949820788530466, "grad_norm": 0.08362897906725004, "learning_rate": 1.0067512223241507e-06, "loss": 0.0, "num_tokens": 115389212.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1391 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 3.9978494623655916, "grad_norm": 0.08647927186288491, "learning_rate": 1.0056260316625558e-06, "loss": -0.0, "num_tokens": 115469012.0, "reward": 0.796875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1392 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.002867383512545, "grad_norm": 0.05710977983237636, "learning_rate": 1.0045008338778277e-06, "loss": 0.0, "num_tokens": 115555267.0, "reward": 0.8046875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1393 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917003347966285e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.00573476702509, "grad_norm": 0.1891657059044791, "learning_rate": 1.0033756303945828e-06, "loss": -0.0, "num_tokens": 115633653.0, "reward": 0.7421875, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1394 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907227504745508e-10, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.008602150537635, "grad_norm": 0.14291200048617728, "learning_rate": 1.0022504226374438e-06, "loss": 0.0, "num_tokens": 115716623.0, "reward": 0.71875, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1395 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 8.450174584180815e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.011469534050179, "grad_norm": 0.04918178407942894, "learning_rate": 1.0011252120310387e-06, "loss": -0.0, "num_tokens": 115801496.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1396 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691702426092771e-09, "advantages/std": 0.5726984143257141, "advantages/var": 0.3279834737711873, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.014336917562724, "grad_norm": 0.1371515913326463, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 115884411.0, "reward": 0.875, "reward_std": 0.17700131237506866, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1397 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970961834751672e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.017204301075269, "grad_norm": 0.09653238089827981, "learning_rate": 9.988747879689612e-07, "loss": 0.0, "num_tokens": 115964895.0, "reward": 0.78125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1398 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.2470398597593402e-08, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.020071684587814, "grad_norm": 0.11479796353253469, "learning_rate": 9.97749577362556e-07, "loss": 0.0, "num_tokens": 116046692.0, "reward": 0.9453125, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1399 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.022939068100358, "grad_norm": 0.065055089170235, "learning_rate": 9.966243696054175e-07, "loss": 0.0, "num_tokens": 116123982.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1400 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.453528449034464e-09, "advantages/std": 0.5228003859519958, "advantages/var": 0.2733202435515558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.025806451612903, "grad_norm": 0.132510357212156, "learning_rate": 9.954991661221722e-07, "loss": 0.0, "num_tokens": 116212703.0, "reward": 0.8203125, "reward_std": 0.14913025498390198, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1401 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041577316723057e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.028673835125448, "grad_norm": 4.972098368851917, "learning_rate": 9.943739683374443e-07, "loss": -0.0, "num_tokens": 116286447.0, "reward": 0.8359375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1402 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.031541218637993, "grad_norm": 0.07543600126473533, "learning_rate": 9.93248777675849e-07, "loss": 0.0, "num_tokens": 116359830.0, "reward": 0.9765625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 1403 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.2470118506643711e-08, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 4.034408602150537, "grad_norm": 0.09332703119373481, "learning_rate": 9.921235955619938e-07, "loss": 0.0, "num_tokens": 116446264.0, "reward": 0.75, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1404 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.037275985663083, "grad_norm": 0.08579305490742774, "learning_rate": 9.909984234204737e-07, "loss": -0.0, "num_tokens": 116526150.0, "reward": 0.7421875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1405 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.040143369175627, "grad_norm": 0.08498597057038827, "learning_rate": 9.898732626758724e-07, "loss": 0.0, "num_tokens": 116594493.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 1406 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.4536335554007825e-09, "advantages/std": 0.5227880477905273, "advantages/var": 0.2733073429126307, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.043010752688172, "grad_norm": 0.09641363420996596, "learning_rate": 9.887481147527586e-07, "loss": -0.0, "num_tokens": 116675425.0, "reward": 0.7421875, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1407 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6612716317176819, "advantages/var": 0.4372801709145655, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.045878136200717, "grad_norm": 0.17621604071730299, "learning_rate": 9.876229810756855e-07, "loss": -0.0, "num_tokens": 116764230.0, "reward": 0.6875, "reward_std": 0.20517177879810333, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 1408 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.048745519713262, "grad_norm": 0.06353852467813742, "learning_rate": 9.864978630691865e-07, "loss": 0.0, "num_tokens": 116827715.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1409 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.051612903225807, "grad_norm": 0.11046684032249461, "learning_rate": 9.853727621577773e-07, "loss": 0.0, "num_tokens": 116907743.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1410 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876548503938182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.054480286738351, "grad_norm": 0.11414665198528622, "learning_rate": 9.842476797659508e-07, "loss": 0.0, "num_tokens": 116972996.0, "reward": 0.890625, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1411 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.23513460695797e-09, "advantages/std": 0.5227840542793274, "advantages/var": 0.27330316740873073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.057347670250896, "grad_norm": 0.13426543371669594, "learning_rate": 9.831226173181769e-07, "loss": 0.0, "num_tokens": 117056953.0, "reward": 0.796875, "reward_std": 0.13204574584960938, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1412 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.060215053763441, "grad_norm": 0.07070571532585718, "learning_rate": 9.819975762388993e-07, "loss": 0.0, "num_tokens": 117123155.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1413 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299811976811062e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.063082437275986, "grad_norm": 0.08264438903728014, "learning_rate": 9.808725579525354e-07, "loss": -0.0, "num_tokens": 117210931.0, "reward": 0.8046875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1414 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.06594982078853, "grad_norm": 0.040465076588479504, "learning_rate": 9.797475638834744e-07, "loss": 0.0, "num_tokens": 117286212.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1415 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907307732763402e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.068817204301075, "grad_norm": 0.14344840187422328, "learning_rate": 9.786225954560738e-07, "loss": 0.0, "num_tokens": 117368830.0, "reward": 0.875, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1416 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.626265687704098e-09, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.07168458781362, "grad_norm": 0.18072254333567256, "learning_rate": 9.774976540946586e-07, "loss": 0.0, "num_tokens": 117453632.0, "reward": 0.765625, "reward_std": 0.15072788298130035, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1417 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.074551971326165, "grad_norm": 0.08223549807688292, "learning_rate": 9.763727412235201e-07, "loss": 0.0, "num_tokens": 117529795.0, "reward": 0.9609375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1418 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.077419354838709, "grad_norm": 0.027505824653321093, "learning_rate": 9.752478582669136e-07, "loss": 0.0, "num_tokens": 117595078.0, "reward": 0.8515625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1419 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.080286738351255, "grad_norm": 0.09363819824185526, "learning_rate": 9.74123006649056e-07, "loss": -0.0, "num_tokens": 117682052.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1420 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.0831541218637994, "grad_norm": 0.07481291580787158, "learning_rate": 9.729981877941249e-07, "loss": -0.0, "num_tokens": 117759506.0, "reward": 0.6875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 1421 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.086021505376344, "grad_norm": 0.06013183556706222, "learning_rate": 9.71873403126256e-07, "loss": -0.0, "num_tokens": 117828711.0, "reward": 0.8828125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1422 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814751550759118e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.088888888888889, "grad_norm": 0.09458985029231685, "learning_rate": 9.707486540695418e-07, "loss": 0.0, "num_tokens": 117908229.0, "reward": 0.8359375, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1423 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524951534513563e-09, "advantages/std": 0.5726819038391113, "advantages/var": 0.32796456298478915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.091756272401434, "grad_norm": 0.17279198048030153, "learning_rate": 9.69623942048031e-07, "loss": 0.0, "num_tokens": 117996385.0, "reward": 0.9140625, "reward_std": 0.15756267309188843, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1424 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.094623655913979, "grad_norm": 0.06020982212956875, "learning_rate": 9.68499268485723e-07, "loss": -0.0, "num_tokens": 118081885.0, "reward": 0.8828125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1425 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5628597236829876e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.097491039426523, "grad_norm": 0.12405911654579116, "learning_rate": 9.673746348065708e-07, "loss": 0.0, "num_tokens": 118157017.0, "reward": 0.875, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1426 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 7.3180584939108565e-09, "advantages/std": 0.5726862549781799, "advantages/var": 0.3279695466409329, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.100358422939068, "grad_norm": 0.09539003259588408, "learning_rate": 9.66250042434475e-07, "loss": -0.0, "num_tokens": 118241083.0, "reward": 0.6953125, "reward_std": 0.16439256072044373, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1427 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299811976811062e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.103225806451613, "grad_norm": 0.14925567107369417, "learning_rate": 9.651254927932862e-07, "loss": -0.0, "num_tokens": 118320584.0, "reward": 0.8671875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1428 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.562997839424082e-09, "advantages/std": 0.5227746963500977, "advantages/var": 0.2732933831439368, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.106093189964158, "grad_norm": 0.08122134763980224, "learning_rate": 9.64000987306798e-07, "loss": 0.0, "num_tokens": 118415711.0, "reward": 0.828125, "reward_std": 0.12179600447416306, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1429 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.108960573476702, "grad_norm": 0.058516452906352656, "learning_rate": 9.628765273987505e-07, "loss": 0.0, "num_tokens": 118501690.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1430 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814432667740602e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.111827956989248, "grad_norm": 0.15317697617053813, "learning_rate": 9.61752114492825e-07, "loss": -0.0, "num_tokens": 118574370.0, "reward": 0.8984375, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1431 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.114695340501792, "grad_norm": 0.13592734930576464, "learning_rate": 9.60627750012644e-07, "loss": 0.0, "num_tokens": 118656650.0, "reward": 0.8515625, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1432 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.117562724014337, "grad_norm": 0.10393644269771243, "learning_rate": 9.595034353817672e-07, "loss": 0.0, "num_tokens": 118739205.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1433 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.130986540898401e-09, "advantages/std": 0.5726996064186096, "advantages/var": 0.32798483919203036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.120430107526881, "grad_norm": 0.18049214695560123, "learning_rate": 9.583791720236928e-07, "loss": 0.0, "num_tokens": 118814930.0, "reward": 0.84375, "reward_std": 0.17912296950817108, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1434 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.123297491039427, "grad_norm": 0.058660517467965995, "learning_rate": 9.572549613618537e-07, "loss": -0.0, "num_tokens": 118888437.0, "reward": 0.859375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1435 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.1261648745519715, "grad_norm": 0.08115979157730528, "learning_rate": 9.56130804819615e-07, "loss": 0.0, "num_tokens": 118972605.0, "reward": 0.7890625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1436 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.129032258064516, "grad_norm": 0.09111951306603897, "learning_rate": 9.550067038202754e-07, "loss": -0.0, "num_tokens": 119066369.0, "reward": 0.8125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1437 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.131899641577061, "grad_norm": 0.07689670459835662, "learning_rate": 9.538826597870609e-07, "loss": 0.0, "num_tokens": 119143072.0, "reward": 0.84375, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1438 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.134767025089606, "grad_norm": 0.11387495882854284, "learning_rate": 9.527586741431268e-07, "loss": 0.0, "num_tokens": 119217315.0, "reward": 0.8203125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1439 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.137634408602151, "grad_norm": 0.13494130220291506, "learning_rate": 9.516347483115544e-07, "loss": -0.0, "num_tokens": 119300412.0, "reward": 0.796875, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1440 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954734451444e-08, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 4.140501792114695, "grad_norm": 0.11342136827484776, "learning_rate": 9.505108837153488e-07, "loss": 0.0, "num_tokens": 119385911.0, "reward": 0.765625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1441 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125995678848164e-09, "advantages/std": 0.5227746963500977, "advantages/var": 0.2732933831439368, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.14336917562724, "grad_norm": 0.1520353991334203, "learning_rate": 9.493870817774375e-07, "loss": 0.0, "num_tokens": 119461233.0, "reward": 0.875, "reward_std": 0.12179599702358246, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1442 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.146236559139785, "grad_norm": 0.0, "learning_rate": 9.482633439206695e-07, "loss": 0.0, "num_tokens": 119537972.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1443 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.14910394265233, "grad_norm": 0.08421970535405401, "learning_rate": 9.47139671567812e-07, "loss": 0.0, "num_tokens": 119614186.0, "reward": 0.90625, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1444 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.151971326164874, "grad_norm": 0.11243615414331748, "learning_rate": 9.460160661415495e-07, "loss": 0.0, "num_tokens": 119683159.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1445 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235170151758147e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 4.15483870967742, "grad_norm": 0.10245184557169275, "learning_rate": 9.448925290644812e-07, "loss": 0.0, "num_tokens": 119768738.0, "reward": 0.78125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1446 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.563018557708836e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.157706093189964, "grad_norm": 0.13998590359890162, "learning_rate": 9.437690617591202e-07, "loss": 0.0, "num_tokens": 119852881.0, "reward": 0.90625, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1447 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.160573476702509, "grad_norm": 0.0988656358265864, "learning_rate": 9.426456656478918e-07, "loss": 0.0, "num_tokens": 119930506.0, "reward": 0.71875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1448 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.8788016332274455e-09, "advantages/std": 0.5726749897003174, "advantages/var": 0.3279566438282586, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.163440860215053, "grad_norm": 0.19127430664342882, "learning_rate": 9.415223421531307e-07, "loss": 0.0, "num_tokens": 120018529.0, "reward": 0.8359375, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1449 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.166308243727599, "grad_norm": 0.13094005799328043, "learning_rate": 9.403990926970789e-07, "loss": -0.0, "num_tokens": 120094154.0, "reward": 0.953125, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1450 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.781509278854418e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.1691756272401435, "grad_norm": 0.10072318247330422, "learning_rate": 9.392759187018857e-07, "loss": 0.0, "num_tokens": 120189757.0, "reward": 0.578125, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.4957992732524872, "step": 1451 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958501673983143e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.172043010752688, "grad_norm": 0.12826479513139036, "learning_rate": 9.381528215896048e-07, "loss": 0.0, "num_tokens": 120271028.0, "reward": 0.8671875, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1452 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065668450848788e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.174910394265233, "grad_norm": 0.1440767155168118, "learning_rate": 9.370298027821922e-07, "loss": 0.0, "num_tokens": 120348702.0, "reward": 0.7890625, "reward_std": 0.14966703951358795, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1453 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.177777777777778, "grad_norm": 0.09945515704015066, "learning_rate": 9.359068637015047e-07, "loss": 0.0, "num_tokens": 120423446.0, "reward": 0.84375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1454 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907505770133387e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.180645161290323, "grad_norm": 0.1040926400199926, "learning_rate": 9.34784005769298e-07, "loss": 0.0, "num_tokens": 120501820.0, "reward": 0.8203125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1455 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.183512544802867, "grad_norm": 0.08000849681171769, "learning_rate": 9.336612304072255e-07, "loss": 0.0, "num_tokens": 120569450.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1456 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504924634842639e-09, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.186379928315413, "grad_norm": 0.14289770413103803, "learning_rate": 9.325385390368366e-07, "loss": 0.0, "num_tokens": 120652604.0, "reward": 0.8515625, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1457 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.189247311827957, "grad_norm": 0.09349152460241286, "learning_rate": 9.314159330795729e-07, "loss": 0.0, "num_tokens": 120738338.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1458 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.192114695340502, "grad_norm": 0.04832040714860593, "learning_rate": 9.302934139567689e-07, "loss": 0.0, "num_tokens": 120825619.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1459 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983562397524497e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.194982078853046, "grad_norm": 0.09409055244967295, "learning_rate": 9.291709830896485e-07, "loss": 0.0, "num_tokens": 120902775.0, "reward": 0.8984375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1460 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.126037115417672e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.197849462365592, "grad_norm": 0.17863877682661655, "learning_rate": 9.280486418993253e-07, "loss": 0.0, "num_tokens": 120988258.0, "reward": 0.921875, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1461 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.200716845878136, "grad_norm": 0.09966702242066987, "learning_rate": 9.269263918067969e-07, "loss": 0.0, "num_tokens": 121073667.0, "reward": 0.78125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1462 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.203584229390681, "grad_norm": 0.0, "learning_rate": 9.258042342329479e-07, "loss": 0.0, "num_tokens": 121145750.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1463 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.2064516129032254, "grad_norm": 0.08595277767135193, "learning_rate": 9.246821705985446e-07, "loss": -0.0, "num_tokens": 121223289.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1464 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983539800525091e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.209318996415771, "grad_norm": 0.10545558084009203, "learning_rate": 9.235602023242348e-07, "loss": -0.0, "num_tokens": 121300497.0, "reward": 0.78125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1465 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966859224177393e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.2121863799283155, "grad_norm": 0.1166465405118057, "learning_rate": 9.224383308305446e-07, "loss": 0.0, "num_tokens": 121385926.0, "reward": 0.8203125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1466 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.21505376344086, "grad_norm": 0.12702730738938423, "learning_rate": 9.213165575378792e-07, "loss": 0.0, "num_tokens": 121458830.0, "reward": 0.8359375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1467 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983539546627677e-09, "advantages/std": 0.4675854444503784, "advantages/var": 0.21863614786185792, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.217921146953405, "grad_norm": 0.18718106551736144, "learning_rate": 9.201948838665182e-07, "loss": 0.0, "num_tokens": 121542658.0, "reward": 0.78125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1468 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998248400596406e-09, "advantages/std": 0.4049536883831024, "advantages/var": 0.16398748973507882, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 4.22078853046595, "grad_norm": 0.07568000791039003, "learning_rate": 9.190733112366156e-07, "loss": 0.0, "num_tokens": 121639487.0, "reward": 0.8203125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1469 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876548503938182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.223655913978495, "grad_norm": 0.11969325026531555, "learning_rate": 9.17951841068197e-07, "loss": -0.0, "num_tokens": 121707906.0, "reward": 0.828125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1470 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.226523297491039, "grad_norm": 0.07745000096218684, "learning_rate": 9.168304747811587e-07, "loss": 0.0, "num_tokens": 121778917.0, "reward": 0.9296875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1471 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.6336549553791566e-09, "advantages/std": 0.6612563729286194, "advantages/var": 0.43725999073871336, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.229390681003585, "grad_norm": 0.15047688601036804, "learning_rate": 9.157092137952656e-07, "loss": 0.0, "num_tokens": 121857601.0, "reward": 0.8671875, "reward_std": 0.18361148238182068, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1472 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.975067111642947e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 4.232258064516129, "grad_norm": 0.09297168256476258, "learning_rate": 9.145880595301493e-07, "loss": -0.0, "num_tokens": 121933925.0, "reward": 0.875, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1473 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.235125448028674, "grad_norm": 0.10083035649293086, "learning_rate": 9.134670134053054e-07, "loss": 0.0, "num_tokens": 122011969.0, "reward": 0.875, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1474 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.237992831541218, "grad_norm": 0.12954753409390488, "learning_rate": 9.123460768400933e-07, "loss": 0.0, "num_tokens": 122081124.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1475 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.1292129760556737e-08, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.240860215053764, "grad_norm": 0.17851557977755236, "learning_rate": 9.112252512537341e-07, "loss": 0.0, "num_tokens": 122169256.0, "reward": 0.71875, "reward_std": 0.1820138692855835, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1476 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599614475511504e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.243727598566308, "grad_norm": 0.06532798513154459, "learning_rate": 9.101045380653074e-07, "loss": 0.0, "num_tokens": 122251469.0, "reward": 0.875, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1477 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.246594982078853, "grad_norm": 0.09513417861363471, "learning_rate": 9.089839386937516e-07, "loss": 0.0, "num_tokens": 122332731.0, "reward": 0.96875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1478 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950488391937882e-08, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.2494623655913975, "grad_norm": 0.17134657471060075, "learning_rate": 9.078634545578597e-07, "loss": 0.0, "num_tokens": 122405488.0, "reward": 0.953125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1479 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96693437436781e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.252329749103943, "grad_norm": 0.12659718453337696, "learning_rate": 9.067430870762795e-07, "loss": 0.0, "num_tokens": 122493552.0, "reward": 0.8125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1480 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.2551971326164875, "grad_norm": 0.04849278320959078, "learning_rate": 9.056228376675117e-07, "loss": -0.0, "num_tokens": 122576239.0, "reward": 0.796875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1481 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.258064516129032, "grad_norm": 0.06914980251434022, "learning_rate": 9.045027077499066e-07, "loss": 0.0, "num_tokens": 122664171.0, "reward": 0.90625, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1482 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2997782960918235e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.260931899641577, "grad_norm": 0.07948991734477566, "learning_rate": 9.033826987416632e-07, "loss": -0.0, "num_tokens": 122752107.0, "reward": 0.7734375, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1483 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344456541825744e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.263799283154122, "grad_norm": 0.1428955902481616, "learning_rate": 9.02262812060828e-07, "loss": 0.0, "num_tokens": 122828380.0, "reward": 0.8046875, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1484 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.266666666666667, "grad_norm": 0.07377116415118819, "learning_rate": 9.011430491252923e-07, "loss": 0.0, "num_tokens": 122899534.0, "reward": 0.8984375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1485 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.269534050179211, "grad_norm": 0.0768415894619512, "learning_rate": 9.000234113527911e-07, "loss": -0.0, "num_tokens": 122974114.0, "reward": 0.875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1486 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.272401433691757, "grad_norm": 0.17149827069623058, "learning_rate": 8.989039001609e-07, "loss": 0.0, "num_tokens": 123047232.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1487 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.275268817204301, "grad_norm": 0.10217273379462666, "learning_rate": 8.977845169670352e-07, "loss": 0.0, "num_tokens": 123120772.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1488 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.278136200716846, "grad_norm": 0.10975235544047096, "learning_rate": 8.966652631884504e-07, "loss": -0.0, "num_tokens": 123197304.0, "reward": 0.875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1489 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.28100358422939, "grad_norm": 0.03228892691985338, "learning_rate": 8.955461402422364e-07, "loss": 0.0, "num_tokens": 123274259.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 1490 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.9752007807758586e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.283870967741936, "grad_norm": 0.11509669919237757, "learning_rate": 8.944271495453166e-07, "loss": 0.0, "num_tokens": 123361614.0, "reward": 0.84375, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1491 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.28673835125448, "grad_norm": 0.07993581383478614, "learning_rate": 8.933082925144485e-07, "loss": 0.0, "num_tokens": 123450796.0, "reward": 0.796875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1492 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.289605734767025, "grad_norm": 0.0, "learning_rate": 8.921895705662193e-07, "loss": 0.0, "num_tokens": 123521096.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1493 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 7.745951594996348e-09, "advantages/std": 0.6612840294837952, "advantages/var": 0.4372965676503249, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.29247311827957, "grad_norm": 0.13912726542928147, "learning_rate": 8.910709851170467e-07, "loss": -0.0, "num_tokens": 123610144.0, "reward": 0.71875, "reward_std": 0.22119548916816711, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1494 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 1.1266523706756892e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.295340501792115, "grad_norm": 0.05994487449854081, "learning_rate": 8.899525375831731e-07, "loss": -0.0, "num_tokens": 123687905.0, "reward": 0.8203125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1495 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.9792225930357895e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.2982078853046595, "grad_norm": 0.1009806286692767, "learning_rate": 8.888342293806689e-07, "loss": 0.0, "num_tokens": 123773636.0, "reward": 0.828125, "reward_std": 0.11678344756364822, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1496 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.301075268817204, "grad_norm": 0.0, "learning_rate": 8.877160619254264e-07, "loss": 0.0, "num_tokens": 123842388.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1497 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.3039426523297495, "grad_norm": 0.09998205361814819, "learning_rate": 8.865980366331606e-07, "loss": 0.0, "num_tokens": 123926373.0, "reward": 0.8125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1498 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049402934764073e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.306810035842294, "grad_norm": 0.0875716518529315, "learning_rate": 8.854801549194054e-07, "loss": 0.0, "num_tokens": 124000519.0, "reward": 0.890625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1499 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.309677419354839, "grad_norm": 0.07652125021746135, "learning_rate": 8.843624181995144e-07, "loss": -0.0, "num_tokens": 124092098.0, "reward": 0.6796875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 1500 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917003347966285e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.312544802867383, "grad_norm": 0.09695003076082673, "learning_rate": 8.832448278886566e-07, "loss": -0.0, "num_tokens": 124169758.0, "reward": 0.8515625, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1501 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.315412186379929, "grad_norm": 0.07080920782055285, "learning_rate": 8.821273854018162e-07, "loss": -0.0, "num_tokens": 124248394.0, "reward": 0.8046875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1502 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.318279569892473, "grad_norm": 0.06631141915455245, "learning_rate": 8.810100921537893e-07, "loss": 0.0, "num_tokens": 124313055.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1503 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907227504745508e-10, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.321146953405018, "grad_norm": 0.12112345623153241, "learning_rate": 8.798929495591839e-07, "loss": 0.0, "num_tokens": 124401221.0, "reward": 0.78125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1504 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958849501312727e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.324014336917562, "grad_norm": 0.14301317683757267, "learning_rate": 8.787759590324175e-07, "loss": 0.0, "num_tokens": 124470343.0, "reward": 0.890625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1505 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.326881720430108, "grad_norm": 0.061120960977824365, "learning_rate": 8.776591219877145e-07, "loss": 0.0, "num_tokens": 124550338.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1506 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5056415302478586e-09, "advantages/std": 0.6185553073883057, "advantages/var": 0.3826106682982413, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.329749103942652, "grad_norm": 0.1754009073013372, "learning_rate": 8.765424398391046e-07, "loss": -0.0, "num_tokens": 124638848.0, "reward": 0.6953125, "reward_std": 0.16834919154644012, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1507 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379887186086637e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.332616487455197, "grad_norm": 0.08519214614169628, "learning_rate": 8.75425914000422e-07, "loss": 0.0, "num_tokens": 124722132.0, "reward": 0.8828125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1508 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.494855892459685e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.335483870967742, "grad_norm": 0.08760196360323062, "learning_rate": 8.743095458853032e-07, "loss": 0.0, "num_tokens": 124800189.0, "reward": 0.8203125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1509 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.338351254480287, "grad_norm": 0.05702860630164658, "learning_rate": 8.731933369071849e-07, "loss": -0.0, "num_tokens": 124876940.0, "reward": 0.859375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1510 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9875550720364307e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.3412186379928315, "grad_norm": 0.08605429619390997, "learning_rate": 8.720772884793015e-07, "loss": -0.0, "num_tokens": 124960083.0, "reward": 0.921875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1511 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.967066906198935e-09, "advantages/std": 0.4675861597061157, "advantages/var": 0.21863681674871316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.344086021505376, "grad_norm": 0.11213886305841794, "learning_rate": 8.70961402014685e-07, "loss": 0.0, "num_tokens": 125040014.0, "reward": 0.9453125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1512 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344384639658041e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 4.3469534050179215, "grad_norm": 0.14903303408755642, "learning_rate": 8.698456789261616e-07, "loss": 0.0, "num_tokens": 125121396.0, "reward": 0.78125, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1513 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1498891480459116e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.349820788530466, "grad_norm": 0.07735068537130135, "learning_rate": 8.687301206263518e-07, "loss": 0.0, "num_tokens": 125199687.0, "reward": 0.9453125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1514 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954489382432772e-08, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.352688172043011, "grad_norm": 0.13479114473642162, "learning_rate": 8.676147285276667e-07, "loss": 0.0, "num_tokens": 125272048.0, "reward": 0.875, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1515 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5055885760084386e-09, "advantages/std": 0.6185770630836487, "advantages/var": 0.3826375829731923, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.355555555555555, "grad_norm": 0.14266694682150755, "learning_rate": 8.664995040423067e-07, "loss": -0.0, "num_tokens": 125363968.0, "reward": 0.78125, "reward_std": 0.19568344950675964, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1516 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.358422939068101, "grad_norm": 0.12138187059214987, "learning_rate": 8.653844485822602e-07, "loss": -0.0, "num_tokens": 125438839.0, "reward": 0.7421875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1517 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2525469477123842e-09, "advantages/std": 0.5726727843284607, "advantages/var": 0.32795411791051166, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 4.361290322580645, "grad_norm": 0.11781333290531952, "learning_rate": 8.642695635593023e-07, "loss": 0.0, "num_tokens": 125529165.0, "reward": 0.828125, "reward_std": 0.1462521106004715, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1518 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629381233631915e-09, "advantages/std": 0.5227834582328796, "advantages/var": 0.273302544201929, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.36415770609319, "grad_norm": 0.11450533395385713, "learning_rate": 8.631548503849915e-07, "loss": -0.0, "num_tokens": 125613193.0, "reward": 0.7421875, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1519 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.367025089605734, "grad_norm": 0.1717613200259846, "learning_rate": 8.620403104706686e-07, "loss": 0.0, "num_tokens": 125680840.0, "reward": 0.8671875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1520 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380984365940094, "advantages/var": 0.05466704299203351, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.36989247311828, "grad_norm": 0.05959738383725868, "learning_rate": 8.609259452274557e-07, "loss": 0.0, "num_tokens": 125751046.0, "reward": 0.71875, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1521 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.372759856630824, "grad_norm": 0.1400111794092891, "learning_rate": 8.598117560662533e-07, "loss": -0.0, "num_tokens": 125825873.0, "reward": 0.953125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1522 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 4.375627240143369, "grad_norm": 0.041986627317893196, "learning_rate": 8.586977443977396e-07, "loss": -0.0, "num_tokens": 125904104.0, "reward": 0.7578125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1523 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917033813576203e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 4.378494623655914, "grad_norm": 0.11661446312178016, "learning_rate": 8.575839116323669e-07, "loss": 0.0, "num_tokens": 125996254.0, "reward": 0.796875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1524 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 4.381362007168459, "grad_norm": 0.09182377957203804, "learning_rate": 8.564702591803619e-07, "loss": -0.0, "num_tokens": 126077138.0, "reward": 0.78125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1525 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9875832530345343e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.3842293906810035, "grad_norm": 0.15097119268491282, "learning_rate": 8.553567884517226e-07, "loss": 0.0, "num_tokens": 126145191.0, "reward": 0.8828125, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1526 }, { "advantages/mean": -7.916241884231567e-09, "advantages/snr": 1.6929100023819388e-08, "advantages/std": 0.46761149168014526, "advantages/var": 0.21866050715133056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.387096774193548, "grad_norm": 0.12967105364753814, "learning_rate": 8.54243500856218e-07, "loss": 0.0, "num_tokens": 126226663.0, "reward": 0.828125, "reward_std": 0.12255740165710449, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1527 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814751550759118e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.3899641577060935, "grad_norm": 0.11527739695198473, "learning_rate": 8.531303978033829e-07, "loss": 0.0, "num_tokens": 126298221.0, "reward": 0.8671875, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1528 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.392831541218638, "grad_norm": 0.1883962631871225, "learning_rate": 8.520174807025209e-07, "loss": -0.0, "num_tokens": 126364653.0, "reward": 0.9453125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1529 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.979258453394051e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 4.395698924731183, "grad_norm": 0.09878037298414231, "learning_rate": 8.509047509626987e-07, "loss": -0.0, "num_tokens": 126458094.0, "reward": 0.703125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 1530 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.398566308243727, "grad_norm": 0.03007972669000544, "learning_rate": 8.497922099927468e-07, "loss": -0.0, "num_tokens": 126532719.0, "reward": 0.8203125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1531 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.401433691756273, "grad_norm": 0.13238509731892936, "learning_rate": 8.486798592012552e-07, "loss": 0.0, "num_tokens": 126608679.0, "reward": 0.96875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1532 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958683201273463e-10, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.404301075268817, "grad_norm": 0.0757346675969034, "learning_rate": 8.475676999965746e-07, "loss": 0.0, "num_tokens": 126690558.0, "reward": 0.9296875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1533 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721682514236524e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.407168458781362, "grad_norm": 0.11811042309308818, "learning_rate": 8.464557337868126e-07, "loss": 0.0, "num_tokens": 126767154.0, "reward": 0.703125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 1534 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5628345390257e-09, "advantages/std": 0.5227986574172974, "advantages/var": 0.27331843619732865, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.410035842293907, "grad_norm": 0.13639475877953225, "learning_rate": 8.453439619798324e-07, "loss": -0.0, "num_tokens": 126844983.0, "reward": 0.7890625, "reward_std": 0.14677615463733673, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1535 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.412903225806452, "grad_norm": 0.08356876229843305, "learning_rate": 8.442323859832508e-07, "loss": -0.0, "num_tokens": 126926652.0, "reward": 0.828125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1536 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 9.95862671130252e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.415770609318996, "grad_norm": 0.11748319686103294, "learning_rate": 8.43121007204437e-07, "loss": 0.0, "num_tokens": 127001488.0, "reward": 0.859375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1537 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.418637992831541, "grad_norm": 0.0793231963490384, "learning_rate": 8.420098270505108e-07, "loss": 0.0, "num_tokens": 127072722.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1538 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.421505376344086, "grad_norm": 0.055645089131594244, "learning_rate": 8.408988469283402e-07, "loss": -0.0, "num_tokens": 127144584.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1539 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.424372759856631, "grad_norm": 0.056100176748938295, "learning_rate": 8.397880682445396e-07, "loss": 0.0, "num_tokens": 127219968.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1540 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.4272401433691755, "grad_norm": 0.10463434059866998, "learning_rate": 8.386774924054685e-07, "loss": -0.0, "num_tokens": 127296150.0, "reward": 0.765625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1541 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.43010752688172, "grad_norm": 0.05977362799968743, "learning_rate": 8.375671208172304e-07, "loss": 0.0, "num_tokens": 127361298.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1542 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.4329749103942655, "grad_norm": 0.09174086666333871, "learning_rate": 8.364569548856694e-07, "loss": -0.0, "num_tokens": 127449389.0, "reward": 0.8359375, "reward_std": 0.1236182302236557, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1543 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.43584229390681, "grad_norm": 0.0755987201652617, "learning_rate": 8.353469960163689e-07, "loss": 0.0, "num_tokens": 127519611.0, "reward": 0.8359375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1544 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.4948445614343708e-08, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.438709677419355, "grad_norm": 0.08453683000937354, "learning_rate": 8.34237245614651e-07, "loss": 0.0, "num_tokens": 127606813.0, "reward": 0.75, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1545 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907189930094833e-09, "advantages/std": 0.5227925777435303, "advantages/var": 0.27331207934372515, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.4415770609319, "grad_norm": 0.13134660568169892, "learning_rate": 8.331277050855732e-07, "loss": 0.0, "num_tokens": 127685029.0, "reward": 0.8984375, "reward_std": 0.13782459497451782, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1546 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.444444444444445, "grad_norm": 0.05309177452297175, "learning_rate": 8.320183758339283e-07, "loss": 0.0, "num_tokens": 127762373.0, "reward": 0.71875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1547 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.447311827956989, "grad_norm": 0.13427940408964043, "learning_rate": 8.309092592642401e-07, "loss": 0.0, "num_tokens": 127838504.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1548 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 4.450179211469534, "grad_norm": 0.11582365325455868, "learning_rate": 8.29800356780764e-07, "loss": -0.0, "num_tokens": 127928687.0, "reward": 0.7578125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1549 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.453046594982079, "grad_norm": 0.053111368806384486, "learning_rate": 8.286916697874841e-07, "loss": 0.0, "num_tokens": 128004872.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1550 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262492693233955e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.455913978494624, "grad_norm": 0.11176594266265587, "learning_rate": 8.275831996881127e-07, "loss": 0.0, "num_tokens": 128095545.0, "reward": 0.828125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1551 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975244195968941e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.458781362007168, "grad_norm": 0.13524869902232045, "learning_rate": 8.264749478860853e-07, "loss": 0.0, "num_tokens": 128170833.0, "reward": 0.875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1552 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.461648745519713, "grad_norm": 0.07916711345930293, "learning_rate": 8.253669157845631e-07, "loss": 0.0, "num_tokens": 128235550.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1553 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.464516129032258, "grad_norm": 0.09645282716884043, "learning_rate": 8.24259104786428e-07, "loss": 0.0, "num_tokens": 128310662.0, "reward": 0.875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1554 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.467383512544803, "grad_norm": 0.07946594741567561, "learning_rate": 8.231515162942822e-07, "loss": 0.0, "num_tokens": 128380371.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1555 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.4702508960573475, "grad_norm": 0.08806504776261509, "learning_rate": 8.220441517104471e-07, "loss": 0.0, "num_tokens": 128463092.0, "reward": 0.7890625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1556 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.473118279569892, "grad_norm": 0.05649365302533599, "learning_rate": 8.209370124369588e-07, "loss": -0.0, "num_tokens": 128546682.0, "reward": 0.8828125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1557 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954106789488524e-08, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.4759856630824375, "grad_norm": 0.10687162860127353, "learning_rate": 8.198300998755696e-07, "loss": -0.0, "num_tokens": 128628640.0, "reward": 0.8515625, "reward_std": 0.1236182302236557, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1558 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962614376833355e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.478853046594982, "grad_norm": 0.10378706897507785, "learning_rate": 8.187234154277439e-07, "loss": 0.0, "num_tokens": 128711989.0, "reward": 0.8359375, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1559 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.481720430107527, "grad_norm": 0.1279554688962821, "learning_rate": 8.176169604946586e-07, "loss": 0.0, "num_tokens": 128790667.0, "reward": 0.78125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1560 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.484587813620072, "grad_norm": 0.13033485611719636, "learning_rate": 8.165107364771978e-07, "loss": 0.0, "num_tokens": 128875084.0, "reward": 0.9140625, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1561 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6612711548805237, "advantages/var": 0.43727954027702154, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.487455197132617, "grad_norm": 0.13625062308115246, "learning_rate": 8.154047447759554e-07, "loss": 0.0, "num_tokens": 128968539.0, "reward": 0.8203125, "reward_std": 0.20411096513271332, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1562 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.490322580645161, "grad_norm": 0.09146346001930003, "learning_rate": 8.142989867912298e-07, "loss": 0.0, "num_tokens": 129038631.0, "reward": 0.96875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1563 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.493189964157706, "grad_norm": 0.14823129251144393, "learning_rate": 8.131934639230244e-07, "loss": 0.0, "num_tokens": 129117638.0, "reward": 0.8671875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1564 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629503101518235e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.496057347670251, "grad_norm": 0.09899485072652606, "learning_rate": 8.12088177571044e-07, "loss": 0.0, "num_tokens": 129204605.0, "reward": 0.8046875, "reward_std": 0.12863080203533173, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1565 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.258373724324303e-09, "advantages/std": 0.6185795664787292, "advantages/var": 0.3826406800650126, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.498924731182796, "grad_norm": 0.13593794888110886, "learning_rate": 8.109831291346948e-07, "loss": -0.0, "num_tokens": 129289557.0, "reward": 0.78125, "reward_std": 0.2001592218875885, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1566 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.50179211469534, "grad_norm": 0.07242409317259144, "learning_rate": 8.098783200130812e-07, "loss": 0.0, "num_tokens": 129369624.0, "reward": 0.8203125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1567 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.504659498207886, "grad_norm": 0.09399296263010991, "learning_rate": 8.087737516050053e-07, "loss": 0.0, "num_tokens": 129446421.0, "reward": 0.9609375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1568 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.50752688172043, "grad_norm": 0.09191536758570197, "learning_rate": 8.076694253089631e-07, "loss": 0.0, "num_tokens": 129515430.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1569 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.626265687704098e-09, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.510394265232975, "grad_norm": 0.1522348482437637, "learning_rate": 8.065653425231452e-07, "loss": 0.0, "num_tokens": 129598753.0, "reward": 0.859375, "reward_std": 0.15072788298130035, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1570 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.5132616487455195, "grad_norm": 0.0, "learning_rate": 8.05461504645434e-07, "loss": 0.0, "num_tokens": 129669733.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1571 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.516129032258064, "grad_norm": 0.1251916490484711, "learning_rate": 8.043579130734013e-07, "loss": 0.0, "num_tokens": 129741109.0, "reward": 0.9296875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1572 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524951534513563e-09, "advantages/std": 0.5726819038391113, "advantages/var": 0.32796456298478915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.5189964157706095, "grad_norm": 0.14549734052264987, "learning_rate": 8.032545692043068e-07, "loss": -0.0, "num_tokens": 129831912.0, "reward": 0.8203125, "reward_std": 0.15756267309188843, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1573 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.979139449767511e-09, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.521863799283154, "grad_norm": 0.11984844576041742, "learning_rate": 8.021514744350969e-07, "loss": -0.0, "num_tokens": 129911905.0, "reward": 0.6171875, "reward_std": 0.1236182302236557, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 1574 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.524731182795699, "grad_norm": 0.1081565198875998, "learning_rate": 8.010486301624032e-07, "loss": -0.0, "num_tokens": 130005519.0, "reward": 0.6484375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 1575 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998026679709143e-09, "advantages/std": 0.4049575924873352, "advantages/var": 0.16399065171313865, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.527598566308244, "grad_norm": 0.1116032107301051, "learning_rate": 7.999460377825395e-07, "loss": 0.0, "num_tokens": 130082566.0, "reward": 0.8046875, "reward_std": 0.08891239762306213, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1576 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.530465949820789, "grad_norm": 0.1638845893683361, "learning_rate": 7.988436986915003e-07, "loss": -0.0, "num_tokens": 130165929.0, "reward": 0.84375, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1577 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.533333333333333, "grad_norm": 0.13295662249426007, "learning_rate": 7.977416142849605e-07, "loss": 0.0, "num_tokens": 130241029.0, "reward": 0.8046875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1578 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40496885776519775, "advantages/var": 0.16399977575964897, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.536200716845878, "grad_norm": 0.10972079286238992, "learning_rate": 7.966397859582712e-07, "loss": -0.0, "num_tokens": 130322199.0, "reward": 0.84375, "reward_std": 0.09810130298137665, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1579 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.1258055642951985e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 4.539068100358423, "grad_norm": 0.12422660645392816, "learning_rate": 7.955382151064609e-07, "loss": 0.0, "num_tokens": 130409780.0, "reward": 0.609375, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 1580 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6262611176060706e-09, "advantages/std": 0.5726771354675293, "advantages/var": 0.3279591014872949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.541935483870968, "grad_norm": 0.0780531493375495, "learning_rate": 7.944369031242306e-07, "loss": 0.0, "num_tokens": 130495116.0, "reward": 0.828125, "reward_std": 0.1530819982290268, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1581 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.544802867383512, "grad_norm": 0.09287131038371223, "learning_rate": 7.933358514059542e-07, "loss": 0.0, "num_tokens": 130574069.0, "reward": 0.9296875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1582 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.547670250896058, "grad_norm": 0.16809308550778532, "learning_rate": 7.922350613456763e-07, "loss": 0.0, "num_tokens": 130654253.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1583 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.550537634408602, "grad_norm": 0.06683962956290307, "learning_rate": 7.911345343371103e-07, "loss": -0.0, "num_tokens": 130733105.0, "reward": 0.8515625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1584 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.553405017921147, "grad_norm": 0.09373972474736782, "learning_rate": 7.900342717736353e-07, "loss": -0.0, "num_tokens": 130822672.0, "reward": 0.8203125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1585 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.5562724014336915, "grad_norm": 0.10264809816037185, "learning_rate": 7.88934275048297e-07, "loss": -0.0, "num_tokens": 130897567.0, "reward": 0.953125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1586 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.559139784946236, "grad_norm": 0.10525879291354667, "learning_rate": 7.878345455538043e-07, "loss": 0.0, "num_tokens": 130975683.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1587 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494468808174133, "advantages/var": 0.16398020040561878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.5620071684587815, "grad_norm": 0.09257790509906005, "learning_rate": 7.867350846825271e-07, "loss": 0.0, "num_tokens": 131055145.0, "reward": 0.84375, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1588 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.564874551971326, "grad_norm": 0.1446030853921875, "learning_rate": 7.856358938264953e-07, "loss": 0.0, "num_tokens": 131133287.0, "reward": 0.90625, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1589 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.567741935483871, "grad_norm": 0.06268478241855875, "learning_rate": 7.84536974377398e-07, "loss": -0.0, "num_tokens": 131197738.0, "reward": 0.96875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1590 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.570609318996416, "grad_norm": 0.09331097190519154, "learning_rate": 7.834383277265792e-07, "loss": 0.0, "num_tokens": 131275636.0, "reward": 0.7890625, "reward_std": 0.08443662524223328, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1591 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983628920516591e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.573476702508961, "grad_norm": 0.18006050372622187, "learning_rate": 7.823399552650383e-07, "loss": 0.0, "num_tokens": 131361700.0, "reward": 0.734375, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1592 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.576344086021505, "grad_norm": 0.16944149752660384, "learning_rate": 7.812418583834281e-07, "loss": -0.0, "num_tokens": 131434842.0, "reward": 0.90625, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1593 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917221686896894e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.57921146953405, "grad_norm": 0.1036148638309728, "learning_rate": 7.801440384720509e-07, "loss": -0.0, "num_tokens": 131513412.0, "reward": 0.7890625, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1594 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96706741399221e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.582078853046595, "grad_norm": 0.16731095967597007, "learning_rate": 7.790464969208597e-07, "loss": -0.0, "num_tokens": 131591086.0, "reward": 0.8359375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1595 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5050938954247684e-09, "advantages/std": 0.5726727843284607, "advantages/var": 0.32795411791051166, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.58494623655914, "grad_norm": 0.14980625968664033, "learning_rate": 7.779492351194546e-07, "loss": 0.0, "num_tokens": 131676647.0, "reward": 0.828125, "reward_std": 0.1462521106004715, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1596 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 4.587813620071684, "grad_norm": 0.25115726970915364, "learning_rate": 7.768522544570817e-07, "loss": 0.0, "num_tokens": 131753930.0, "reward": 0.8515625, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1597 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998806953079044e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.59068100358423, "grad_norm": 0.07780192650417037, "learning_rate": 7.757555563226305e-07, "loss": 0.0, "num_tokens": 131837580.0, "reward": 0.9453125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1598 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917699002625455e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.593548387096774, "grad_norm": 0.0806428245538292, "learning_rate": 7.746591421046335e-07, "loss": -0.0, "num_tokens": 131916844.0, "reward": 0.765625, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1599 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 4.596415770609319, "grad_norm": 0.0723027089703405, "learning_rate": 7.735630131912637e-07, "loss": -0.0, "num_tokens": 132008689.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1600 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.5992831541218635, "grad_norm": 0.08158367358736894, "learning_rate": 7.724671709703326e-07, "loss": 0.0, "num_tokens": 132073467.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1601 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.528079978419126e-09, "advantages/std": 0.6185657978057861, "advantages/var": 0.3826236462151087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.602150537634409, "grad_norm": 0.14388181305992928, "learning_rate": 7.713716168292887e-07, "loss": -0.0, "num_tokens": 132155281.0, "reward": 0.8515625, "reward_std": 0.1830746978521347, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1602 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.6050179211469535, "grad_norm": 0.06219203290418096, "learning_rate": 7.702763521552153e-07, "loss": -0.0, "num_tokens": 132241545.0, "reward": 0.8984375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1603 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.607885304659498, "grad_norm": 0.09435091549837735, "learning_rate": 7.691813783348308e-07, "loss": -0.0, "num_tokens": 132321404.0, "reward": 0.921875, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1604 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907308748317195e-09, "advantages/std": 0.5227856040000916, "advantages/var": 0.27330478774974054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.610752688172043, "grad_norm": 0.14888850594239775, "learning_rate": 7.68086696754484e-07, "loss": 0.0, "num_tokens": 132400975.0, "reward": 0.828125, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1605 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.613620071684588, "grad_norm": 0.10091741032199689, "learning_rate": 7.669923088001537e-07, "loss": 0.0, "num_tokens": 132481527.0, "reward": 0.7734375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1606 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814615465526806e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.616487455197133, "grad_norm": 0.12915255405954504, "learning_rate": 7.658982158574469e-07, "loss": -0.0, "num_tokens": 132564893.0, "reward": 0.703125, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 1607 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.29459053084411e-08, "advantages/std": 0.4676070511341095, "advantages/var": 0.2186563542703377, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.619354838709677, "grad_norm": 0.10581661925296995, "learning_rate": 7.648044193115983e-07, "loss": 0.0, "num_tokens": 132640397.0, "reward": 0.9140625, "reward_std": 0.12019838392734528, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1608 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.2470340303516295e-08, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.622222222222222, "grad_norm": 0.12392200488142388, "learning_rate": 7.637109205474663e-07, "loss": 0.0, "num_tokens": 132725080.0, "reward": 0.78125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1609 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.625089605734767, "grad_norm": 0.10554181629616402, "learning_rate": 7.626177209495319e-07, "loss": 0.0, "num_tokens": 132789271.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1610 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.627956989247312, "grad_norm": 0.086996552301831, "learning_rate": 7.615248219018981e-07, "loss": -0.0, "num_tokens": 132871722.0, "reward": 0.7421875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1611 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149940263022739e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.630824372759856, "grad_norm": 0.14215844478089376, "learning_rate": 7.60432224788287e-07, "loss": 0.0, "num_tokens": 132946368.0, "reward": 0.8203125, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1612 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599795920412534e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.633691756272402, "grad_norm": 0.14701246559534492, "learning_rate": 7.593399309920393e-07, "loss": -0.0, "num_tokens": 133021640.0, "reward": 0.828125, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1613 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.636559139784946, "grad_norm": 0.11497633964923037, "learning_rate": 7.582479418961101e-07, "loss": -0.0, "num_tokens": 133097958.0, "reward": 0.796875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1614 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.639426523297491, "grad_norm": 0.05788553387589012, "learning_rate": 7.571562588830697e-07, "loss": 0.0, "num_tokens": 133191439.0, "reward": 0.7734375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1615 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.763983771436959e-09, "advantages/std": 0.6185750365257263, "advantages/var": 0.38263507581280365, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.6422939068100355, "grad_norm": 0.14491687864547192, "learning_rate": 7.560648833351007e-07, "loss": 0.0, "num_tokens": 133274277.0, "reward": 0.7265625, "reward_std": 0.1922685205936432, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1616 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.645161290322581, "grad_norm": 0.12258378917704145, "learning_rate": 7.54973816633997e-07, "loss": 0.0, "num_tokens": 133346144.0, "reward": 0.8671875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1617 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599521727490371e-09, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.6480286738351255, "grad_norm": 0.07265957256575328, "learning_rate": 7.538830601611599e-07, "loss": -0.0, "num_tokens": 133431302.0, "reward": 0.796875, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1618 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.65089605734767, "grad_norm": 0.08736898389564368, "learning_rate": 7.527926152975999e-07, "loss": 0.0, "num_tokens": 133508245.0, "reward": 0.9375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1619 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.6537634408602155, "grad_norm": 0.027703098343415788, "learning_rate": 7.517024834239311e-07, "loss": -0.0, "num_tokens": 133594592.0, "reward": 0.6953125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1620 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.65663082437276, "grad_norm": 0.08427394693962005, "learning_rate": 7.506126659203732e-07, "loss": 0.0, "num_tokens": 133669055.0, "reward": 0.7265625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1621 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.659498207885305, "grad_norm": 0.05510938332542658, "learning_rate": 7.495231641667458e-07, "loss": -0.0, "num_tokens": 133751963.0, "reward": 0.890625, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1622 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.662365591397849, "grad_norm": 0.05035489070551885, "learning_rate": 7.484339795424705e-07, "loss": 0.0, "num_tokens": 133827487.0, "reward": 0.7578125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1623 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262673803395154e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.665232974910394, "grad_norm": 0.15364144928913698, "learning_rate": 7.473451134265665e-07, "loss": 0.0, "num_tokens": 133915334.0, "reward": 0.7578125, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1624 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.668100358422939, "grad_norm": 0.10363090295376025, "learning_rate": 7.462565671976503e-07, "loss": -0.0, "num_tokens": 133989607.0, "reward": 0.8984375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1625 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.670967741935484, "grad_norm": 0.07421035931391864, "learning_rate": 7.451683422339323e-07, "loss": -0.0, "num_tokens": 134060459.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1626 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.673835125448028, "grad_norm": 0.13227855390169405, "learning_rate": 7.440804399132172e-07, "loss": 0.0, "num_tokens": 134126741.0, "reward": 0.875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1627 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5049727039524085e-09, "advantages/std": 0.5726834535598755, "advantages/var": 0.32796633798126607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 4.676702508960574, "grad_norm": 0.19359569030053653, "learning_rate": 7.429928616129009e-07, "loss": 0.0, "num_tokens": 134215282.0, "reward": 0.8046875, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1628 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.679569892473118, "grad_norm": 0.05907275427514059, "learning_rate": 7.419056087099694e-07, "loss": -0.0, "num_tokens": 134300129.0, "reward": 0.765625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1629 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.4393256658538594e-09, "advantages/std": 0.5726926326751709, "advantages/var": 0.3279768515204182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.682437275985663, "grad_norm": 0.1537068880657909, "learning_rate": 7.408186825809957e-07, "loss": -0.0, "num_tokens": 134374852.0, "reward": 0.875, "reward_std": 0.1712273508310318, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1630 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.6853046594982075, "grad_norm": 0.13636365349874846, "learning_rate": 7.397320846021397e-07, "loss": 0.0, "num_tokens": 134449949.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1631 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299802498719973e-09, "advantages/std": 0.4049576222896576, "advantages/var": 0.16399067585049298, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.688172043010753, "grad_norm": 0.12425002393391864, "learning_rate": 7.386458161491465e-07, "loss": -0.0, "num_tokens": 134523285.0, "reward": 0.8671875, "reward_std": 0.08891239762306213, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1632 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.6910394265232975, "grad_norm": 0.09466145442952983, "learning_rate": 7.375598785973429e-07, "loss": 0.0, "num_tokens": 134600090.0, "reward": 0.7890625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1633 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633180108710322e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.693906810035842, "grad_norm": 0.11381714746002539, "learning_rate": 7.364742733216372e-07, "loss": 0.0, "num_tokens": 134673298.0, "reward": 0.9453125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1634 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979139449767511e-09, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.6967741935483875, "grad_norm": 0.10872391394227172, "learning_rate": 7.353890016965169e-07, "loss": 0.0, "num_tokens": 134748297.0, "reward": 0.8203125, "reward_std": 0.1236182302236557, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1635 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.699641577060932, "grad_norm": 0.0, "learning_rate": 7.343040650960469e-07, "loss": 0.0, "num_tokens": 134823428.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1636 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.702508960573477, "grad_norm": 0.064011796942797, "learning_rate": 7.332194648938688e-07, "loss": -0.0, "num_tokens": 134903274.0, "reward": 0.8984375, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1637 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.705376344086021, "grad_norm": 0.12022436045089688, "learning_rate": 7.321352024631973e-07, "loss": -0.0, "num_tokens": 134986570.0, "reward": 0.9296875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1638 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450174584180815e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.708243727598567, "grad_norm": 0.07791697208110278, "learning_rate": 7.310512791768198e-07, "loss": 0.0, "num_tokens": 135070456.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1639 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.711111111111111, "grad_norm": 0.08557264076533448, "learning_rate": 7.299676964070938e-07, "loss": 0.0, "num_tokens": 135140402.0, "reward": 0.8984375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1640 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.713978494623656, "grad_norm": 0.07883140905174156, "learning_rate": 7.288844555259471e-07, "loss": 0.0, "num_tokens": 135224668.0, "reward": 0.8515625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1641 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.7168458781362, "grad_norm": 0.110481504205288, "learning_rate": 7.278015579048734e-07, "loss": 0.0, "num_tokens": 135301935.0, "reward": 0.8125, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1642 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954368597466913e-08, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.719713261648746, "grad_norm": 0.10336929955002645, "learning_rate": 7.267190049149317e-07, "loss": 0.0, "num_tokens": 135384382.0, "reward": 0.8125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1643 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9875211793780565e-09, "advantages/std": 0.4676063358783722, "advantages/var": 0.21865568535359703, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.72258064516129, "grad_norm": 0.09791630050111359, "learning_rate": 7.256367979267455e-07, "loss": -0.0, "num_tokens": 135467754.0, "reward": 0.828125, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1644 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.970961834751672e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.725448028673835, "grad_norm": 0.10299045133209574, "learning_rate": 7.245549383104992e-07, "loss": 0.0, "num_tokens": 135553008.0, "reward": 0.828125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1645 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8994882297977766e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.7283154121863795, "grad_norm": 0.14588499031325744, "learning_rate": 7.234734274359388e-07, "loss": 0.0, "num_tokens": 135639384.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1646 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.527839878197673e-10, "advantages/std": 0.6185855269432068, "advantages/var": 0.3826480541436048, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 4.731182795698925, "grad_norm": 0.17317502917605887, "learning_rate": 7.223922666723676e-07, "loss": -0.0, "num_tokens": 135719633.0, "reward": 0.6953125, "reward_std": 0.20699402689933777, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1647 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.7340501792114695, "grad_norm": 0.13800906614522476, "learning_rate": 7.213114573886458e-07, "loss": -0.0, "num_tokens": 135794499.0, "reward": 0.8046875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1648 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.736917562724014, "grad_norm": 0.06521309150206973, "learning_rate": 7.202310009531884e-07, "loss": 0.0, "num_tokens": 135872734.0, "reward": 0.9296875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1649 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 8.049402934764073e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.7397849462365595, "grad_norm": 0.0762940262182169, "learning_rate": 7.191508987339654e-07, "loss": 0.0, "num_tokens": 135952386.0, "reward": 0.84375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1650 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.742652329749104, "grad_norm": 0.12225458348992939, "learning_rate": 7.180711520984952e-07, "loss": 0.0, "num_tokens": 136039969.0, "reward": 0.8046875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1651 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2523980013455208e-09, "advantages/std": 0.5726990103721619, "advantages/var": 0.32798415648125356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.745519713261649, "grad_norm": 0.15372109186111152, "learning_rate": 7.169917624138488e-07, "loss": 0.0, "num_tokens": 136119659.0, "reward": 0.8671875, "reward_std": 0.17806214094161987, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1652 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.748387096774193, "grad_norm": 0.10231956708089485, "learning_rate": 7.15912731046644e-07, "loss": -0.0, "num_tokens": 136191006.0, "reward": 0.8359375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1653 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235142426779239e-09, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.751254480286739, "grad_norm": 0.13484719765167572, "learning_rate": 7.148340593630452e-07, "loss": 0.0, "num_tokens": 136271001.0, "reward": 0.7265625, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1654 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.754121863799283, "grad_norm": 0.08229723941001783, "learning_rate": 7.137557487287607e-07, "loss": -0.0, "num_tokens": 136356037.0, "reward": 0.84375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1655 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.756989247311828, "grad_norm": 0.11561987468906357, "learning_rate": 7.126778005090431e-07, "loss": 0.0, "num_tokens": 136430606.0, "reward": 0.90625, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1656 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962749759103603e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.759856630824372, "grad_norm": 0.20534656263558138, "learning_rate": 7.11600216068685e-07, "loss": 0.0, "num_tokens": 136506150.0, "reward": 0.8671875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1657 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.762724014336918, "grad_norm": 0.23299202155065596, "learning_rate": 7.105229967720191e-07, "loss": 0.0, "num_tokens": 136583371.0, "reward": 0.78125, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1658 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975166886896984e-09, "advantages/std": 0.4675965905189514, "advantages/var": 0.21864657146494793, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.765591397849462, "grad_norm": 0.09342418994880777, "learning_rate": 7.09446143982915e-07, "loss": 0.0, "num_tokens": 136666709.0, "reward": 0.9296875, "reward_std": 0.10994865745306015, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1659 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9750579720916185e-09, "advantages/std": 0.4676051139831543, "advantages/var": 0.21865454262319872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.768458781362007, "grad_norm": 0.08349540862040114, "learning_rate": 7.083696590647786e-07, "loss": 0.0, "num_tokens": 136756219.0, "reward": 0.7890625, "reward_std": 0.11784426867961884, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1660 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.7713261648745515, "grad_norm": 0.11801519200360466, "learning_rate": 7.072935433805507e-07, "loss": -0.0, "num_tokens": 136841112.0, "reward": 0.8984375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1661 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2997337848958775e-09, "advantages/std": 0.404969722032547, "advantages/var": 0.16400047576311838, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 4.774193548387097, "grad_norm": 0.06475097052472373, "learning_rate": 7.06217798292704e-07, "loss": 0.0, "num_tokens": 136919042.0, "reward": 0.7890625, "reward_std": 0.09916213154792786, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1662 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.970911630250105e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 4.7770609318996415, "grad_norm": 0.11656949784420355, "learning_rate": 7.051424251632418e-07, "loss": -0.0, "num_tokens": 137007284.0, "reward": 0.625, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 1663 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.779928315412186, "grad_norm": 0.0945511456503056, "learning_rate": 7.040674253536965e-07, "loss": 0.0, "num_tokens": 137077723.0, "reward": 0.8515625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1664 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917699002625455e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.7827956989247316, "grad_norm": 0.12004320795593788, "learning_rate": 7.029928002251287e-07, "loss": -0.0, "num_tokens": 137149648.0, "reward": 0.828125, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1665 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.785663082437276, "grad_norm": 0.08839211137410177, "learning_rate": 7.019185511381238e-07, "loss": 0.0, "num_tokens": 137223917.0, "reward": 0.9609375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1666 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.788530465949821, "grad_norm": 0.03693768958955889, "learning_rate": 7.008446794527909e-07, "loss": 0.0, "num_tokens": 137300499.0, "reward": 0.90625, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1667 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0112319913676506e-09, "advantages/std": 0.6185657978057861, "advantages/var": 0.3826236462151087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 4.791397849462365, "grad_norm": 0.11623775362396632, "learning_rate": 6.99771186528762e-07, "loss": -0.0, "num_tokens": 137387181.0, "reward": 0.7734375, "reward_std": 0.1830746978521347, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1668 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.794265232974911, "grad_norm": 0.13093804686208912, "learning_rate": 6.986980737251888e-07, "loss": -0.0, "num_tokens": 137462936.0, "reward": 0.8046875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1669 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 8.962600667464421e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.797132616487455, "grad_norm": 0.10717130907933577, "learning_rate": 6.976253424007427e-07, "loss": -0.0, "num_tokens": 137538359.0, "reward": 0.578125, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.4957992732524872, "step": 1670 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.8, "grad_norm": 0.10350714272675278, "learning_rate": 6.965529939136114e-07, "loss": -0.0, "num_tokens": 137614388.0, "reward": 0.8203125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1671 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.802867383512545, "grad_norm": 0.06582431752065288, "learning_rate": 6.954810296214976e-07, "loss": 0.0, "num_tokens": 137692045.0, "reward": 0.7734375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1672 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.3444254652277355e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.80573476702509, "grad_norm": 0.13532073690127588, "learning_rate": 6.944094508816181e-07, "loss": 0.0, "num_tokens": 137781720.0, "reward": 0.8046875, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1673 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962814881146116e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.808602150537634, "grad_norm": 0.09251980140920273, "learning_rate": 6.933382590507016e-07, "loss": -0.0, "num_tokens": 137856287.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1674 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562997839424082e-09, "advantages/std": 0.5227746963500977, "advantages/var": 0.2732933831439368, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.811469534050179, "grad_norm": 0.08853742787381969, "learning_rate": 6.92267455484987e-07, "loss": 0.0, "num_tokens": 137940285.0, "reward": 0.765625, "reward_std": 0.12179599702358246, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1675 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 4.8143369175627235, "grad_norm": 0.07298975448657384, "learning_rate": 6.91197041540221e-07, "loss": 0.0, "num_tokens": 138029888.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1676 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.817204301075269, "grad_norm": 0.11434426915796067, "learning_rate": 6.901270185716575e-07, "loss": 0.0, "num_tokens": 138108212.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1677 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96693437436781e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.8200716845878135, "grad_norm": 0.08609948110605993, "learning_rate": 6.89057387934055e-07, "loss": 0.0, "num_tokens": 138181691.0, "reward": 0.875, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1678 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950134223285894e-08, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.822939068100358, "grad_norm": 0.08392182725521677, "learning_rate": 6.879881509816763e-07, "loss": -0.0, "num_tokens": 138270242.0, "reward": 0.875, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1679 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.825806451612904, "grad_norm": 0.07611886420636763, "learning_rate": 6.869193090682843e-07, "loss": -0.0, "num_tokens": 138348798.0, "reward": 0.7421875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1680 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.828673835125448, "grad_norm": 0.0982792481690089, "learning_rate": 6.858508635471428e-07, "loss": 0.0, "num_tokens": 138420340.0, "reward": 0.9765625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 1681 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.425143889473195e-08, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.831541218637993, "grad_norm": 0.10544123657587952, "learning_rate": 6.847828157710127e-07, "loss": 0.0, "num_tokens": 138498723.0, "reward": 0.921875, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1682 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.834408602150537, "grad_norm": 0.20239531167136152, "learning_rate": 6.837151670921533e-07, "loss": -0.0, "num_tokens": 138588563.0, "reward": 0.6171875, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.4879830479621887, "step": 1683 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.837275985663083, "grad_norm": 0.08460167166327032, "learning_rate": 6.82647918862316e-07, "loss": 0.0, "num_tokens": 138681623.0, "reward": 0.8203125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1684 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.3360541173152519e-08, "advantages/std": 0.5228021144866943, "advantages/var": 0.27332205091175865, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.840143369175627, "grad_norm": 0.09624207245640586, "learning_rate": 6.815810724327468e-07, "loss": 0.0, "num_tokens": 138773832.0, "reward": 0.8203125, "reward_std": 0.15148437023162842, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1685 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.843010752688172, "grad_norm": 0.11083136432676015, "learning_rate": 6.805146291541831e-07, "loss": -0.0, "num_tokens": 138837254.0, "reward": 0.953125, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1686 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983378074428632e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.845878136200717, "grad_norm": 0.1494266370747206, "learning_rate": 6.794485903768512e-07, "loss": 0.0, "num_tokens": 138919092.0, "reward": 0.875, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1687 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344431558649841e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 4.848745519713262, "grad_norm": 0.1249035905371772, "learning_rate": 6.78382957450465e-07, "loss": 0.0, "num_tokens": 139003387.0, "reward": 0.84375, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1688 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599649341610871e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.851612903225806, "grad_norm": 0.10591973747412967, "learning_rate": 6.773177317242256e-07, "loss": 0.0, "num_tokens": 139073069.0, "reward": 0.8828125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1689 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.854480286738351, "grad_norm": 0.05229478359096736, "learning_rate": 6.762529145468179e-07, "loss": -0.0, "num_tokens": 139148797.0, "reward": 0.7578125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1690 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.857347670250896, "grad_norm": 0.0, "learning_rate": 6.751885072664095e-07, "loss": 0.0, "num_tokens": 139222062.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1691 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.860215053763441, "grad_norm": 0.1356212755508729, "learning_rate": 6.741245112306491e-07, "loss": 0.0, "num_tokens": 139292370.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1692 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.494855892459685e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.8630824372759855, "grad_norm": 0.10644838913526689, "learning_rate": 6.730609277866644e-07, "loss": 0.0, "num_tokens": 139376718.0, "reward": 0.8828125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1693 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.453574654603735e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.86594982078853, "grad_norm": 0.08988747357697519, "learning_rate": 6.719977582810617e-07, "loss": -0.0, "num_tokens": 139472108.0, "reward": 0.65625, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 1694 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299829409932592e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 4.868817204301076, "grad_norm": 0.11191333046930001, "learning_rate": 6.709350040599226e-07, "loss": -0.0, "num_tokens": 139555776.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1695 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.87168458781362, "grad_norm": 0.09852394791062487, "learning_rate": 6.698726664688025e-07, "loss": 0.0, "num_tokens": 139629721.0, "reward": 0.96875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1696 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.97501037071382e-09, "advantages/std": 0.46760883927345276, "advantages/var": 0.21865802656666578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.874551971326165, "grad_norm": 0.09887033707653936, "learning_rate": 6.688107468527295e-07, "loss": -0.0, "num_tokens": 139718114.0, "reward": 0.6953125, "reward_std": 0.11914245784282684, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1697 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.877419354838709, "grad_norm": 0.0, "learning_rate": 6.677492465562033e-07, "loss": 0.0, "num_tokens": 139788104.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1698 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.504868442276958e-09, "advantages/std": 0.5726926326751709, "advantages/var": 0.3279768515204182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 4.880286738351255, "grad_norm": 0.13135190833765129, "learning_rate": 6.666881669231921e-07, "loss": -0.0, "num_tokens": 139867176.0, "reward": 0.828125, "reward_std": 0.171227365732193, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1699 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958180526526718e-10, "advantages/std": 0.46761682629585266, "advantages/var": 0.21866549623500564, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.883154121863799, "grad_norm": 0.08760053973741729, "learning_rate": 6.656275092971311e-07, "loss": 0.0, "num_tokens": 139951225.0, "reward": 0.75, "reward_std": 0.1293872892856598, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1700 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.886021505376344, "grad_norm": 0.089052736176109, "learning_rate": 6.645672750209214e-07, "loss": 0.0, "num_tokens": 140019646.0, "reward": 0.96875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1701 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379866977655094e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.888888888888889, "grad_norm": 0.11224937212072089, "learning_rate": 6.635074654369286e-07, "loss": 0.0, "num_tokens": 140089387.0, "reward": 0.8828125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1702 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.891756272401434, "grad_norm": 0.07986553586560578, "learning_rate": 6.624480818869806e-07, "loss": 0.0, "num_tokens": 140162675.0, "reward": 0.8203125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1703 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.894623655913978, "grad_norm": 0.11206446301223659, "learning_rate": 6.613891257123652e-07, "loss": 0.0, "num_tokens": 140226686.0, "reward": 0.96875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1704 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.96265150658483e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 4.897491039426523, "grad_norm": 0.09464547910317243, "learning_rate": 6.603305982538294e-07, "loss": 0.0, "num_tokens": 140305406.0, "reward": 0.8671875, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1705 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380984365940094, "advantages/var": 0.05466704299203351, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.900358422939068, "grad_norm": 0.08937874680302588, "learning_rate": 6.592725008515773e-07, "loss": -0.0, "num_tokens": 140383480.0, "reward": 0.78125, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1706 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958849501312727e-10, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.903225806451613, "grad_norm": 0.09765013745455495, "learning_rate": 6.582148348452699e-07, "loss": -0.0, "num_tokens": 140467072.0, "reward": 0.75, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1707 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.9060931899641576, "grad_norm": 0.1222620846497803, "learning_rate": 6.571576015740191e-07, "loss": 0.0, "num_tokens": 140541660.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1708 }, { "advantages/mean": 6.51925802230835e-09, "advantages/snr": 1.3941743289762473e-08, "advantages/std": 0.4676070809364319, "advantages/var": 0.21865638214189076, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.908960573476703, "grad_norm": 0.09454060755806572, "learning_rate": 6.561008023763914e-07, "loss": 0.0, "num_tokens": 140625366.0, "reward": 0.8203125, "reward_std": 0.12019838392734528, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1709 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599614475511504e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.911827956989248, "grad_norm": 0.13555003931504256, "learning_rate": 6.550444385904032e-07, "loss": 0.0, "num_tokens": 140695054.0, "reward": 0.9375, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1710 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 4.914695340501792, "grad_norm": 0.09516021365964764, "learning_rate": 6.539885115535186e-07, "loss": -0.0, "num_tokens": 140786979.0, "reward": 0.6875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 1711 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.917562724014337, "grad_norm": 0.0, "learning_rate": 6.529330226026506e-07, "loss": 0.0, "num_tokens": 140856971.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1712 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950488391937882e-08, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.920430107526881, "grad_norm": 0.10092452127505862, "learning_rate": 6.518779730741554e-07, "loss": 0.0, "num_tokens": 140936874.0, "reward": 0.828125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1713 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.923297491039427, "grad_norm": 0.04411808604439211, "learning_rate": 6.508233643038341e-07, "loss": -0.0, "num_tokens": 141012761.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1714 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.926164874551971, "grad_norm": 0.0322422152411568, "learning_rate": 6.497691976269296e-07, "loss": -0.0, "num_tokens": 141097051.0, "reward": 0.859375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1715 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.929032258064516, "grad_norm": 0.09155685432631067, "learning_rate": 6.487154743781256e-07, "loss": 0.0, "num_tokens": 141179243.0, "reward": 0.8046875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1716 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.45367316198517e-09, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.931899641577061, "grad_norm": 0.10127201522358535, "learning_rate": 6.476621958915424e-07, "loss": 0.0, "num_tokens": 141256649.0, "reward": 0.7734375, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1717 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 4.934767025089606, "grad_norm": 0.05331845694815187, "learning_rate": 6.466093635007397e-07, "loss": 0.0, "num_tokens": 141335322.0, "reward": 0.796875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1718 }, { "advantages/mean": -7.916241884231567e-09, "advantages/snr": 1.5141863799394746e-08, "advantages/std": 0.5228049755096436, "advantages/var": 0.273325042417639, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.93763440860215, "grad_norm": 0.0997539260668846, "learning_rate": 6.455569785387105e-07, "loss": 0.0, "num_tokens": 141430219.0, "reward": 0.8359375, "reward_std": 0.1525501012802124, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1719 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.940501792114695, "grad_norm": 0.08453368071186579, "learning_rate": 6.44505042337883e-07, "loss": -0.0, "num_tokens": 141510843.0, "reward": 0.859375, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1720 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.94336917562724, "grad_norm": 0.04205308392745947, "learning_rate": 6.434535562301152e-07, "loss": 0.0, "num_tokens": 141583915.0, "reward": 0.796875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1721 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.946236559139785, "grad_norm": 0.11643484145917221, "learning_rate": 6.424025215466968e-07, "loss": 0.0, "num_tokens": 141655988.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1722 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971078240891425e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.94910394265233, "grad_norm": 0.10430840916517461, "learning_rate": 6.413519396183455e-07, "loss": 0.0, "num_tokens": 141731014.0, "reward": 0.9140625, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1723 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.951971326164875, "grad_norm": 0.1523243992207566, "learning_rate": 6.40301811775206e-07, "loss": -0.0, "num_tokens": 141791103.0, "reward": 0.96875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1724 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.95483870967742, "grad_norm": 0.1217709933519384, "learning_rate": 6.392521393468471e-07, "loss": -0.0, "num_tokens": 141865535.0, "reward": 0.671875, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1725 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.957706093189964, "grad_norm": 0.0736153775168493, "learning_rate": 6.382029236622617e-07, "loss": -0.0, "num_tokens": 141943036.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1726 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 4.960573476702509, "grad_norm": 0.060291069484120795, "learning_rate": 6.371541660498651e-07, "loss": 0.0, "num_tokens": 142030491.0, "reward": 0.7421875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1727 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.963440860215054, "grad_norm": 0.11269864326468705, "learning_rate": 6.36105867837492e-07, "loss": 0.0, "num_tokens": 142116698.0, "reward": 0.8203125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1728 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.453692965541534e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 4.966308243727599, "grad_norm": 0.14868033927371826, "learning_rate": 6.350580303523946e-07, "loss": 0.0, "num_tokens": 142195353.0, "reward": 0.890625, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1729 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878688398451923e-09, "advantages/std": 0.5726882815361023, "advantages/var": 0.32797186780877396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.969175627240143, "grad_norm": 0.13763173975271825, "learning_rate": 6.340106549212429e-07, "loss": -0.0, "num_tokens": 142275715.0, "reward": 0.8125, "reward_std": 0.1643974632024765, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1730 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149940263022739e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.972043010752688, "grad_norm": 0.08337521404229342, "learning_rate": 6.329637428701218e-07, "loss": -0.0, "num_tokens": 142355509.0, "reward": 0.8828125, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1731 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227746963500977, "advantages/var": 0.2732933831439368, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.974910394265233, "grad_norm": 0.12437869174628784, "learning_rate": 6.319172955245293e-07, "loss": 0.0, "num_tokens": 142433421.0, "reward": 0.890625, "reward_std": 0.12179599702358246, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1732 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.977777777777778, "grad_norm": 0.04592323494933643, "learning_rate": 6.308713142093748e-07, "loss": 0.0, "num_tokens": 142510819.0, "reward": 0.7265625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1733 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.980645161290322, "grad_norm": 0.13970535726888655, "learning_rate": 6.298258002489779e-07, "loss": -0.0, "num_tokens": 142594790.0, "reward": 0.8359375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1734 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344431558649841e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 4.983512544802867, "grad_norm": 0.15244272723685726, "learning_rate": 6.287807549670663e-07, "loss": 0.0, "num_tokens": 142672168.0, "reward": 0.875, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1735 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.986379928315412, "grad_norm": 0.04776247984209465, "learning_rate": 6.27736179686775e-07, "loss": 0.0, "num_tokens": 142748771.0, "reward": 0.7734375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1736 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.989247311827957, "grad_norm": 0.2015861481177434, "learning_rate": 6.266920757306429e-07, "loss": 0.0, "num_tokens": 142815438.0, "reward": 0.8984375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1737 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.992114695340502, "grad_norm": 0.054341266197759155, "learning_rate": 6.256484444206127e-07, "loss": 0.0, "num_tokens": 142895666.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1738 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 4.994982078853047, "grad_norm": 0.10739314990443329, "learning_rate": 6.246052870780287e-07, "loss": -0.0, "num_tokens": 142984818.0, "reward": 0.703125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 1739 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983628920516591e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 4.997849462365592, "grad_norm": 0.14704563705136212, "learning_rate": 6.235626050236355e-07, "loss": 0.0, "num_tokens": 143054398.0, "reward": 0.796875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1740 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.002867383512545, "grad_norm": 0.051420375009002865, "learning_rate": 6.225203995775745e-07, "loss": 0.0, "num_tokens": 143136232.0, "reward": 0.734375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1741 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.00573476702509, "grad_norm": 0.10426521127479207, "learning_rate": 6.214786720593853e-07, "loss": 0.0, "num_tokens": 143210767.0, "reward": 0.9296875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1742 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.4536284776845e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.008602150537635, "grad_norm": 0.14482718305467646, "learning_rate": 6.204374237880015e-07, "loss": 0.0, "num_tokens": 143292941.0, "reward": 0.890625, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1743 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 1.1266523706756892e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.011469534050179, "grad_norm": 0.059359201068545704, "learning_rate": 6.193966560817507e-07, "loss": -0.0, "num_tokens": 143365284.0, "reward": 0.9453125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1744 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.014336917562724, "grad_norm": 0.058939128235451325, "learning_rate": 6.183563702583506e-07, "loss": 0.0, "num_tokens": 143442545.0, "reward": 0.8203125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1745 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.017204301075269, "grad_norm": 0.08724254557798017, "learning_rate": 6.173165676349102e-07, "loss": 0.0, "num_tokens": 143526805.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1746 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599658819865184e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.020071684587814, "grad_norm": 0.07225058271238646, "learning_rate": 6.162772495279264e-07, "loss": -0.0, "num_tokens": 143608375.0, "reward": 0.953125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1747 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.966680494285092e-09, "advantages/std": 0.46760883927345276, "advantages/var": 0.21865802656666578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 5.022939068100358, "grad_norm": 0.09030024864295509, "learning_rate": 6.152384172532819e-07, "loss": 0.0, "num_tokens": 143701698.0, "reward": 0.7265625, "reward_std": 0.11914245784282684, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1748 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9916588263316927e-09, "advantages/std": 0.46761149168014526, "advantages/var": 0.21866050715133056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.025806451612903, "grad_norm": 0.14284937884647111, "learning_rate": 6.142000721262458e-07, "loss": -0.0, "num_tokens": 143783579.0, "reward": 0.84375, "reward_std": 0.12255740165710449, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1749 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.028673835125448, "grad_norm": 0.0797696485104577, "learning_rate": 6.131622154614683e-07, "loss": 0.0, "num_tokens": 143864045.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1750 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5050695213580615e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 5.031541218637993, "grad_norm": 0.1158966547359424, "learning_rate": 6.121248485729831e-07, "loss": 0.0, "num_tokens": 143950941.0, "reward": 0.7578125, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1751 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.034408602150537, "grad_norm": 0.05336124541650834, "learning_rate": 6.110879727742027e-07, "loss": 0.0, "num_tokens": 144022571.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1752 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.037275985663083, "grad_norm": 0.06981000286252387, "learning_rate": 6.100515893779188e-07, "loss": 0.0, "num_tokens": 144105423.0, "reward": 0.8046875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1753 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.040143369175627, "grad_norm": 0.1363742168996095, "learning_rate": 6.09015699696298e-07, "loss": 0.0, "num_tokens": 144176229.0, "reward": 0.9296875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1754 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.043010752688172, "grad_norm": 0.04691676838152845, "learning_rate": 6.079803050408836e-07, "loss": -0.0, "num_tokens": 144247996.0, "reward": 0.9453125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1755 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.045878136200717, "grad_norm": 0.1530190523094471, "learning_rate": 6.06945406722591e-07, "loss": 0.0, "num_tokens": 144314751.0, "reward": 0.96875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1756 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9875164192348156e-09, "advantages/std": 0.4676070809364319, "advantages/var": 0.21865638214189076, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.048745519713262, "grad_norm": 0.09774635096609512, "learning_rate": 6.05911006051708e-07, "loss": -0.0, "num_tokens": 144398766.0, "reward": 0.6484375, "reward_std": 0.12019838392734528, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4793342351913452, "step": 1757 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 8.049402934764073e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 5.051612903225807, "grad_norm": 0.1040979028863591, "learning_rate": 6.048771043378911e-07, "loss": 0.0, "num_tokens": 144483471.0, "reward": 0.78125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1758 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.054480286738351, "grad_norm": 0.08131582776455688, "learning_rate": 6.038437028901666e-07, "loss": -0.0, "num_tokens": 144563587.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1759 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.057347670250896, "grad_norm": 0.05424336079130616, "learning_rate": 6.028108030169265e-07, "loss": -0.0, "num_tokens": 144642656.0, "reward": 0.8203125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1760 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.060215053763441, "grad_norm": 0.0, "learning_rate": 6.017784060259279e-07, "loss": 0.0, "num_tokens": 144715111.0, "reward": 0.8125, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1761 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 8.450247070110804e-09, "advantages/std": 0.661274790763855, "advantages/var": 0.4372843488997802, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.063082437275986, "grad_norm": 0.1660673494178795, "learning_rate": 6.00746513224291e-07, "loss": 0.0, "num_tokens": 144807200.0, "reward": 0.7890625, "reward_std": 0.20753081142902374, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1762 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975244195968941e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.06594982078853, "grad_norm": 0.18780836259720457, "learning_rate": 5.997151259184979e-07, "loss": 0.0, "num_tokens": 144884781.0, "reward": 0.765625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1763 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.068817204301075, "grad_norm": 0.07968917981618698, "learning_rate": 5.98684245414391e-07, "loss": 0.0, "num_tokens": 144976402.0, "reward": 0.765625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1764 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.07168458781362, "grad_norm": 0.11952386312560818, "learning_rate": 5.976538730171707e-07, "loss": -0.0, "num_tokens": 145052318.0, "reward": 0.8125, "reward_std": 0.10205792635679245, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1765 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.074551971326165, "grad_norm": 0.049016231424302455, "learning_rate": 5.966240100313937e-07, "loss": -0.0, "num_tokens": 145116981.0, "reward": 0.828125, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1766 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.077419354838709, "grad_norm": 0.048175777862949964, "learning_rate": 5.95594657760972e-07, "loss": 0.0, "num_tokens": 145197382.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1767 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.053907636466336e-08, "advantages/std": 0.618579626083374, "advantages/var": 0.3826407538054468, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.080286738351255, "grad_norm": 0.14543054577741166, "learning_rate": 5.945658175091719e-07, "loss": 0.0, "num_tokens": 145276698.0, "reward": 0.84375, "reward_std": 0.2001592218875885, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1768 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967257841033182e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.0831541218637994, "grad_norm": 0.10064476075676881, "learning_rate": 5.935374905786102e-07, "loss": 0.0, "num_tokens": 145353875.0, "reward": 0.90625, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1769 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.086021505376344, "grad_norm": 0.036890267935205494, "learning_rate": 5.925096782712538e-07, "loss": 0.0, "num_tokens": 145429444.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1770 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.088888888888889, "grad_norm": 0.12470157001679923, "learning_rate": 5.914823818884189e-07, "loss": -0.0, "num_tokens": 145507605.0, "reward": 0.8984375, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1771 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.091756272401434, "grad_norm": 0.05869326667095858, "learning_rate": 5.904556027307679e-07, "loss": 0.0, "num_tokens": 145587451.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1772 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.3942077395823529e-08, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.094623655913979, "grad_norm": 0.10432344011039471, "learning_rate": 5.894293420983089e-07, "loss": 0.0, "num_tokens": 145662900.0, "reward": 0.875, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1773 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907256955369e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.097491039426523, "grad_norm": 0.09054092079051783, "learning_rate": 5.884036012903921e-07, "loss": 0.0, "num_tokens": 145746956.0, "reward": 0.796875, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1774 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899541037727662e-09, "advantages/std": 0.40494978427886963, "advantages/var": 0.16398432778750305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.100358422939068, "grad_norm": 0.09423064128177898, "learning_rate": 5.873783816057114e-07, "loss": 0.0, "num_tokens": 145811403.0, "reward": 0.8671875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1775 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.1579341628386252e-08, "advantages/std": 0.5227928161621094, "advantages/var": 0.2733123286307091, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.103225806451613, "grad_norm": 0.12627040757335745, "learning_rate": 5.863536843422995e-07, "loss": 0.0, "num_tokens": 145896309.0, "reward": 0.7265625, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 1776 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.106093189964158, "grad_norm": 0.07948580732809564, "learning_rate": 5.853295107975289e-07, "loss": 0.0, "num_tokens": 145962996.0, "reward": 0.9453125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1777 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125942055767658e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.108960573476702, "grad_norm": 0.15472698679819985, "learning_rate": 5.843058622681073e-07, "loss": 0.0, "num_tokens": 146049384.0, "reward": 0.8515625, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1778 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.111827956989248, "grad_norm": 0.07359250655187671, "learning_rate": 5.832827400500794e-07, "loss": 0.0, "num_tokens": 146122420.0, "reward": 0.953125, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1779 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.114695340501792, "grad_norm": 0.08254274226738148, "learning_rate": 5.822601454388223e-07, "loss": 0.0, "num_tokens": 146200474.0, "reward": 0.84375, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1780 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.267469408556425e-08, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.117562724014337, "grad_norm": 0.0882255630075865, "learning_rate": 5.812380797290463e-07, "loss": 0.0, "num_tokens": 146276500.0, "reward": 0.828125, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1781 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.120430107526881, "grad_norm": 0.10011928540464056, "learning_rate": 5.802165442147911e-07, "loss": -0.0, "num_tokens": 146353975.0, "reward": 0.96875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1782 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.123297491039427, "grad_norm": 0.12919290574892164, "learning_rate": 5.791955401894248e-07, "loss": -0.0, "num_tokens": 146427886.0, "reward": 0.921875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1783 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.1261648745519715, "grad_norm": 0.10845132767806537, "learning_rate": 5.781750689456435e-07, "loss": 0.0, "num_tokens": 146494899.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1784 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.129032258064516, "grad_norm": 0.04768167934092357, "learning_rate": 5.771551317754691e-07, "loss": 0.0, "num_tokens": 146574156.0, "reward": 0.859375, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1785 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.131899641577061, "grad_norm": 0.025027818818730816, "learning_rate": 5.76135729970246e-07, "loss": 0.0, "num_tokens": 146644653.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1786 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049627482891083, "advantages/var": 0.16399482750186767, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.134767025089606, "grad_norm": 0.08388678335748852, "learning_rate": 5.75116864820641e-07, "loss": -0.0, "num_tokens": 146725445.0, "reward": 0.84375, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1787 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.137634408602151, "grad_norm": 0.08288362155938514, "learning_rate": 5.740985376166422e-07, "loss": 0.0, "num_tokens": 146817818.0, "reward": 0.671875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 1788 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.140501792114695, "grad_norm": 0.04871816815622585, "learning_rate": 5.730807496475567e-07, "loss": 0.0, "num_tokens": 146890478.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1789 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.14336917562724, "grad_norm": 0.08230015577181828, "learning_rate": 5.720635022020082e-07, "loss": 0.0, "num_tokens": 146968403.0, "reward": 0.7109375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1790 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970911630250105e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.146236559139785, "grad_norm": 0.07019564524774083, "learning_rate": 5.710467965679355e-07, "loss": -0.0, "num_tokens": 147051471.0, "reward": 0.859375, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1791 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.13136821560598e-09, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.14910394265233, "grad_norm": 0.11210520406452633, "learning_rate": 5.700306340325931e-07, "loss": 0.0, "num_tokens": 147139169.0, "reward": 0.78125, "reward_std": 0.1462520956993103, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1792 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.151971326164874, "grad_norm": 0.10304077543283272, "learning_rate": 5.690150158825462e-07, "loss": 0.0, "num_tokens": 147215606.0, "reward": 0.9453125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1793 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.15483870967742, "grad_norm": 0.06774208086141432, "learning_rate": 5.679999434036724e-07, "loss": 0.0, "num_tokens": 147283802.0, "reward": 0.8828125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1794 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.157706093189964, "grad_norm": 0.0, "learning_rate": 5.669854178811564e-07, "loss": 0.0, "num_tokens": 147348294.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1795 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983539800525091e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.160573476702509, "grad_norm": 0.10173609497794077, "learning_rate": 5.659714405994925e-07, "loss": 0.0, "num_tokens": 147425935.0, "reward": 0.828125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1796 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.163440860215053, "grad_norm": 0.12454392102126001, "learning_rate": 5.649580128424791e-07, "loss": 0.0, "num_tokens": 147505940.0, "reward": 0.8359375, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1797 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2946288161655503e-08, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.166308243727599, "grad_norm": 0.08493938281601522, "learning_rate": 5.639451358932203e-07, "loss": 0.0, "num_tokens": 147594173.0, "reward": 0.8828125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1798 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.1691756272401435, "grad_norm": 0.0887236559371361, "learning_rate": 5.629328110341217e-07, "loss": 0.0, "num_tokens": 147673586.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1799 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.172043010752688, "grad_norm": 0.06787093086525699, "learning_rate": 5.619210395468907e-07, "loss": 0.0, "num_tokens": 147768016.0, "reward": 0.7890625, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1800 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2946288161655503e-08, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.174910394265233, "grad_norm": 0.11292903222098369, "learning_rate": 5.609098227125333e-07, "loss": 0.0, "num_tokens": 147838620.0, "reward": 0.9453125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1801 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907256955369e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.177777777777778, "grad_norm": 0.11244689407496437, "learning_rate": 5.598991618113542e-07, "loss": -0.0, "num_tokens": 147924707.0, "reward": 0.875, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1802 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.1266652673742488e-08, "advantages/std": 0.33064746856689453, "advantages/var": 0.10932774846969551, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.180645161290323, "grad_norm": 0.07870096063958429, "learning_rate": 5.58889058122953e-07, "loss": 0.0, "num_tokens": 147992761.0, "reward": 0.90625, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1803 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721015324148586e-09, "advantages/std": 0.5228034257888794, "advantages/var": 0.2733234220165883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.183512544802867, "grad_norm": 0.20088785350237562, "learning_rate": 5.578795129262254e-07, "loss": 0.0, "num_tokens": 148068237.0, "reward": 0.8203125, "reward_std": 0.15360605716705322, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1804 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306550085544586, "advantages/var": 0.1093327346821491, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 5.186379928315413, "grad_norm": 0.06858851307313002, "learning_rate": 5.568705274993584e-07, "loss": 0.0, "num_tokens": 148158495.0, "reward": 0.8125, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1805 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.189247311827957, "grad_norm": 0.1523815068034485, "learning_rate": 5.558621031198317e-07, "loss": 0.0, "num_tokens": 148236114.0, "reward": 0.7890625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1806 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987538125611118e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.192114695340502, "grad_norm": 0.1202866833437615, "learning_rate": 5.548542410644132e-07, "loss": -0.0, "num_tokens": 148321911.0, "reward": 0.8984375, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1807 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.749390312251308e-09, "advantages/std": 0.4049657881259918, "advantages/var": 0.1639972895525057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.194982078853046, "grad_norm": 0.09572978561770634, "learning_rate": 5.538469426091595e-07, "loss": 0.0, "num_tokens": 148397750.0, "reward": 0.8359375, "reward_std": 0.094686359167099, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1808 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.197849462365592, "grad_norm": 0.09354703080827789, "learning_rate": 5.528402090294142e-07, "loss": 0.0, "num_tokens": 148471544.0, "reward": 0.8671875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1809 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983349132682101e-09, "advantages/std": 0.4676077961921692, "advantages/var": 0.21865705105969724, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.200716845878136, "grad_norm": 0.13539962414403606, "learning_rate": 5.518340415998055e-07, "loss": 0.0, "num_tokens": 148558072.0, "reward": 0.890625, "reward_std": 0.12125921249389648, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1810 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958501673983143e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.203584229390681, "grad_norm": 0.11131124567062191, "learning_rate": 5.508284415942441e-07, "loss": 0.0, "num_tokens": 148642583.0, "reward": 0.8046875, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1811 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125909557323754e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.2064516129032254, "grad_norm": 0.15026819642465544, "learning_rate": 5.498234102859222e-07, "loss": -0.0, "num_tokens": 148722128.0, "reward": 0.828125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1812 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.209318996415771, "grad_norm": 0.06661326814815241, "learning_rate": 5.488189489473131e-07, "loss": 0.0, "num_tokens": 148799408.0, "reward": 0.9453125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1813 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.991766726549734e-09, "advantages/std": 0.4675861597061157, "advantages/var": 0.21863681674871316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.2121863799283155, "grad_norm": 0.10657950688635445, "learning_rate": 5.478150588501681e-07, "loss": 0.0, "num_tokens": 148871489.0, "reward": 0.9453125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1814 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.21505376344086, "grad_norm": 0.11086951659826781, "learning_rate": 5.468117412655147e-07, "loss": 0.0, "num_tokens": 148942135.0, "reward": 0.875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1815 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380984365940094, "advantages/var": 0.05466704299203351, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.217921146953405, "grad_norm": 0.0813492842945334, "learning_rate": 5.458089974636551e-07, "loss": -0.0, "num_tokens": 149013589.0, "reward": 0.90625, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1816 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.22078853046595, "grad_norm": 0.09958757792398544, "learning_rate": 5.448068287141662e-07, "loss": 0.0, "num_tokens": 149087877.0, "reward": 0.8671875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1817 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.223655913978495, "grad_norm": 0.08548723075463324, "learning_rate": 5.438052362858974e-07, "loss": -0.0, "num_tokens": 149180715.0, "reward": 0.8203125, "reward_std": 0.08443662524223328, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1818 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.226523297491039, "grad_norm": 0.12481555793055853, "learning_rate": 5.428042214469661e-07, "loss": 0.0, "num_tokens": 149272076.0, "reward": 0.8828125, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1819 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.229390681003585, "grad_norm": 0.12064713889761638, "learning_rate": 5.418037854647599e-07, "loss": 0.0, "num_tokens": 149354092.0, "reward": 0.765625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1820 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966813525430481e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.232258064516129, "grad_norm": 0.09892394825246684, "learning_rate": 5.408039296059334e-07, "loss": 0.0, "num_tokens": 149442991.0, "reward": 0.84375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1821 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.235125448028674, "grad_norm": 0.10791824615849031, "learning_rate": 5.398046551364078e-07, "loss": 0.0, "num_tokens": 149524910.0, "reward": 0.7890625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1822 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.237992831541218, "grad_norm": 0.09244461993010375, "learning_rate": 5.388059633213651e-07, "loss": 0.0, "num_tokens": 149616419.0, "reward": 0.8515625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1823 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.240860215053764, "grad_norm": 0.16256213728805313, "learning_rate": 5.378078554252523e-07, "loss": 0.0, "num_tokens": 149688072.0, "reward": 0.8515625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1824 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.3941844586185219e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.243727598566308, "grad_norm": 0.09845655874877357, "learning_rate": 5.368103327117768e-07, "loss": -0.0, "num_tokens": 149772056.0, "reward": 0.8046875, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1825 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.246594982078853, "grad_norm": 0.059794942562653566, "learning_rate": 5.35813396443904e-07, "loss": -0.0, "num_tokens": 149847887.0, "reward": 0.859375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1826 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.2494623655913975, "grad_norm": 0.15207452579939762, "learning_rate": 5.348170478838579e-07, "loss": 0.0, "num_tokens": 149932170.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1827 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.252329749103943, "grad_norm": 0.11597795702763866, "learning_rate": 5.338212882931172e-07, "loss": 0.0, "num_tokens": 150003031.0, "reward": 0.8359375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 1828 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.2551971326164875, "grad_norm": 0.0, "learning_rate": 5.328261189324166e-07, "loss": 0.0, "num_tokens": 150072094.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1829 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.258064516129032, "grad_norm": 0.13879677550315522, "learning_rate": 5.318315410617417e-07, "loss": -0.0, "num_tokens": 150148892.0, "reward": 0.8203125, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1830 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.199209994879892e-09, "advantages/std": 0.4049576222896576, "advantages/var": 0.16399067585049298, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 5.260931899641577, "grad_norm": 0.08015474128312423, "learning_rate": 5.308375559403306e-07, "loss": 0.0, "num_tokens": 150233854.0, "reward": 0.8046875, "reward_std": 0.08891239762306213, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1831 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983406762715241e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.263799283154122, "grad_norm": 0.1087398856557787, "learning_rate": 5.298441648266699e-07, "loss": 0.0, "num_tokens": 150312596.0, "reward": 0.828125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1832 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 5.266666666666667, "grad_norm": 0.03697671803596281, "learning_rate": 5.28851368978495e-07, "loss": 0.0, "num_tokens": 150406110.0, "reward": 0.78125, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1833 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.757495615940373e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.269534050179211, "grad_norm": 0.12998212949409493, "learning_rate": 5.278591696527868e-07, "loss": 0.0, "num_tokens": 150486513.0, "reward": 0.921875, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1834 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.272401433691757, "grad_norm": 0.07672766562987829, "learning_rate": 5.268675681057719e-07, "loss": 0.0, "num_tokens": 150553755.0, "reward": 0.7578125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1835 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.275268817204301, "grad_norm": 0.111884856419137, "learning_rate": 5.258765655929188e-07, "loss": -0.0, "num_tokens": 150637852.0, "reward": 0.859375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1836 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966859224177393e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.278136200716846, "grad_norm": 0.0874979299947862, "learning_rate": 5.248861633689391e-07, "loss": -0.0, "num_tokens": 150718790.0, "reward": 0.9140625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1837 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899641578136434e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.28100358422939, "grad_norm": 0.1155783267271921, "learning_rate": 5.238963626877828e-07, "loss": -0.0, "num_tokens": 150798216.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 1838 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.283870967741936, "grad_norm": 0.08954806118932522, "learning_rate": 5.229071648026398e-07, "loss": 0.0, "num_tokens": 150883238.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1839 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814615465526806e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.28673835125448, "grad_norm": 0.16870842199738778, "learning_rate": 5.219185709659354e-07, "loss": -0.0, "num_tokens": 150968520.0, "reward": 0.78125, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1840 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.289605734767025, "grad_norm": 0.09873181563072708, "learning_rate": 5.209305824293307e-07, "loss": 0.0, "num_tokens": 151054344.0, "reward": 0.9140625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1841 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.52825480422812e-09, "advantages/std": 0.6185514330863953, "advantages/var": 0.3826058753732333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.29247311827957, "grad_norm": 0.1421703163138954, "learning_rate": 5.199432004437205e-07, "loss": 0.0, "num_tokens": 151132679.0, "reward": 0.8046875, "reward_std": 0.1649293452501297, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1842 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380984365940094, "advantages/var": 0.05466704299203351, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.295340501792115, "grad_norm": 0.05798456668261622, "learning_rate": 5.189564262592326e-07, "loss": 0.0, "num_tokens": 151200553.0, "reward": 0.84375, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1843 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.2982078853046595, "grad_norm": 0.03668176632247106, "learning_rate": 5.179702611252231e-07, "loss": 0.0, "num_tokens": 151267289.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1844 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.301075268817204, "grad_norm": 0.1789702658219247, "learning_rate": 5.169847062902784e-07, "loss": 0.0, "num_tokens": 151355476.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1845 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 1.068867300569461e-08, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.3039426523297495, "grad_norm": 0.10527680576301442, "learning_rate": 5.159997630022119e-07, "loss": 0.0, "num_tokens": 151439451.0, "reward": 0.859375, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1846 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.9792225930357895e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.306810035842294, "grad_norm": 0.09277743427521075, "learning_rate": 5.150154325080636e-07, "loss": 0.0, "num_tokens": 151518552.0, "reward": 0.609375, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.4898075461387634, "step": 1847 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494468808174133, "advantages/var": 0.16398020040561878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 5.309677419354839, "grad_norm": 0.10411630799642824, "learning_rate": 5.140317160540961e-07, "loss": 0.0, "num_tokens": 151603921.0, "reward": 0.828125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1848 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967079601050182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.312544802867383, "grad_norm": 0.11719080496103142, "learning_rate": 5.130486148857951e-07, "loss": 0.0, "num_tokens": 151678722.0, "reward": 0.953125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1849 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.672149362599044e-09, "advantages/std": 0.5227940678596497, "advantages/var": 0.27331363738923997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.315412186379929, "grad_norm": 0.1683067073965903, "learning_rate": 5.120661302478677e-07, "loss": -0.0, "num_tokens": 151756456.0, "reward": 0.8828125, "reward_std": 0.1433563083410263, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1850 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958683201273463e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.318279569892473, "grad_norm": 0.11269277479100119, "learning_rate": 5.110842633842405e-07, "loss": 0.0, "num_tokens": 151828491.0, "reward": 0.8828125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1851 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.494855892459685e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.321146953405018, "grad_norm": 0.10160086799341961, "learning_rate": 5.101030155380575e-07, "loss": 0.0, "num_tokens": 151905489.0, "reward": 0.7578125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1852 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.324014336917562, "grad_norm": 0.08345050577257829, "learning_rate": 5.091223879516784e-07, "loss": 0.0, "num_tokens": 151994257.0, "reward": 0.765625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1853 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.326881720430108, "grad_norm": 0.06111620190505675, "learning_rate": 5.081423818666787e-07, "loss": 0.0, "num_tokens": 152074009.0, "reward": 0.984375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1854 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.329749103942652, "grad_norm": 0.19723566690935937, "learning_rate": 5.071629985238473e-07, "loss": -0.0, "num_tokens": 152160116.0, "reward": 0.7734375, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1855 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975244195968941e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.332616487455197, "grad_norm": 0.1322883239193507, "learning_rate": 5.061842391631826e-07, "loss": -0.0, "num_tokens": 152243787.0, "reward": 0.828125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1856 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.335483870967742, "grad_norm": 0.0616129656091537, "learning_rate": 5.05206105023895e-07, "loss": 0.0, "num_tokens": 152316961.0, "reward": 0.9453125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1857 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299811976811062e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.338351254480287, "grad_norm": 0.08578285901724589, "learning_rate": 5.042285973444027e-07, "loss": 0.0, "num_tokens": 152397505.0, "reward": 0.9296875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1858 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917811987622486e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.3412186379928315, "grad_norm": 0.10947534256692386, "learning_rate": 5.032517173623305e-07, "loss": 0.0, "num_tokens": 152466137.0, "reward": 0.8671875, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1859 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.344086021505376, "grad_norm": 0.08082531946198057, "learning_rate": 5.022754663145081e-07, "loss": 0.0, "num_tokens": 152551397.0, "reward": 0.7890625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1860 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998806953079044e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.3469534050179215, "grad_norm": 0.13423104771148225, "learning_rate": 5.0129984543697e-07, "loss": 0.0, "num_tokens": 152626323.0, "reward": 0.8515625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1861 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.349820788530466, "grad_norm": 0.029017165456185587, "learning_rate": 5.003248559649525e-07, "loss": 0.0, "num_tokens": 152702387.0, "reward": 0.84375, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1862 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9875209889720355e-09, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.352688172043011, "grad_norm": 0.14866076394896222, "learning_rate": 4.993504991328913e-07, "loss": -0.0, "num_tokens": 152772860.0, "reward": 0.890625, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1863 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958683201273463e-10, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.355555555555555, "grad_norm": 0.13050036332563442, "learning_rate": 4.983767761744229e-07, "loss": 0.0, "num_tokens": 152839086.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1864 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.358422939068101, "grad_norm": 0.07580217828083215, "learning_rate": 4.974036883223798e-07, "loss": 0.0, "num_tokens": 152922350.0, "reward": 0.796875, "reward_std": 0.10205792635679245, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1865 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252446745927492e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.361290322580645, "grad_norm": 0.13115111983883432, "learning_rate": 4.964312368087915e-07, "loss": -0.0, "num_tokens": 152999796.0, "reward": 0.8046875, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1866 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2523980013455208e-09, "advantages/std": 0.5726990103721619, "advantages/var": 0.32798415648125356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.36415770609319, "grad_norm": 0.13322988909806766, "learning_rate": 4.954594228648806e-07, "loss": -0.0, "num_tokens": 153087722.0, "reward": 0.7734375, "reward_std": 0.17806214094161987, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1867 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.367025089605734, "grad_norm": 0.06590568814798663, "learning_rate": 4.944882477210641e-07, "loss": -0.0, "num_tokens": 153159296.0, "reward": 0.859375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1868 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962814881146116e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.36989247311828, "grad_norm": 0.10051890127853474, "learning_rate": 4.935177126069484e-07, "loss": -0.0, "num_tokens": 153240775.0, "reward": 0.7109375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1869 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599521727490371e-09, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.372759856630824, "grad_norm": 0.09394611314446655, "learning_rate": 4.925478187513312e-07, "loss": -0.0, "num_tokens": 153325009.0, "reward": 0.734375, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1870 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.375627240143369, "grad_norm": 0.08158773125655303, "learning_rate": 4.91578567382197e-07, "loss": 0.0, "num_tokens": 153405905.0, "reward": 0.8125, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1871 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958683201273463e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.378494623655914, "grad_norm": 0.11164679408804816, "learning_rate": 4.906099597267177e-07, "loss": 0.0, "num_tokens": 153483337.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1872 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.033819584201132e-09, "advantages/std": 0.618557333946228, "advantages/var": 0.38261317537866546, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.381362007168459, "grad_norm": 0.16164954065256135, "learning_rate": 4.896419970112499e-07, "loss": 0.0, "num_tokens": 153557165.0, "reward": 0.921875, "reward_std": 0.17176413536071777, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1873 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379866977655094e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.3842293906810035, "grad_norm": 0.16709996664347315, "learning_rate": 4.886746804613332e-07, "loss": 0.0, "num_tokens": 153641555.0, "reward": 0.8828125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1874 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.387096774193548, "grad_norm": 0.05553338294295206, "learning_rate": 4.877080113016897e-07, "loss": 0.0, "num_tokens": 153708630.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1875 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.3899641577060935, "grad_norm": 0.08450687755165441, "learning_rate": 4.867419907562222e-07, "loss": -0.0, "num_tokens": 153780975.0, "reward": 0.7421875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1876 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899541037727662e-09, "advantages/std": 0.40494978427886963, "advantages/var": 0.16398432778750305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.392831541218638, "grad_norm": 0.12972421739965487, "learning_rate": 4.857766200480115e-07, "loss": 0.0, "num_tokens": 153857618.0, "reward": 0.8671875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1877 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 1.494855892459685e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.395698924731183, "grad_norm": 0.13205023635736984, "learning_rate": 4.848119003993151e-07, "loss": -0.0, "num_tokens": 153937816.0, "reward": 0.8671875, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1878 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.398566308243727, "grad_norm": 0.07242213255933978, "learning_rate": 4.838478330315677e-07, "loss": 0.0, "num_tokens": 154010870.0, "reward": 0.921875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1879 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.401433691756273, "grad_norm": 0.023759560310194276, "learning_rate": 4.828844191653776e-07, "loss": -0.0, "num_tokens": 154085353.0, "reward": 0.9453125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1880 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.404301075268817, "grad_norm": 0.07006989525161213, "learning_rate": 4.819216600205254e-07, "loss": 0.0, "num_tokens": 154166297.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1881 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.407168458781362, "grad_norm": 0.08724927934268048, "learning_rate": 4.809595568159622e-07, "loss": -0.0, "num_tokens": 154248448.0, "reward": 0.7734375, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1882 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 5.410035842293907, "grad_norm": 0.03765949863075639, "learning_rate": 4.799981107698097e-07, "loss": 0.0, "num_tokens": 154330364.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1883 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 7.317935107621051e-09, "advantages/std": 0.5726959109306335, "advantages/var": 0.32798060639666815, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 5.412903225806452, "grad_norm": 0.1071459123786889, "learning_rate": 4.790373230993578e-07, "loss": -0.0, "num_tokens": 154416627.0, "reward": 0.8046875, "reward_std": 0.17676395177841187, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1884 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.415770609318996, "grad_norm": 0.10068983348235683, "learning_rate": 4.780771950210615e-07, "loss": 0.0, "num_tokens": 154498074.0, "reward": 0.734375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1885 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0689055673126508e-08, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.418637992831541, "grad_norm": 0.11209554967802351, "learning_rate": 4.771177277505412e-07, "loss": 0.0, "num_tokens": 154583254.0, "reward": 0.953125, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1886 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.421505376344086, "grad_norm": 0.14202464929928846, "learning_rate": 4.761589225025811e-07, "loss": -0.0, "num_tokens": 154665590.0, "reward": 0.8828125, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1887 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.424372759856631, "grad_norm": 0.1849833455302111, "learning_rate": 4.7520078049112764e-07, "loss": -0.0, "num_tokens": 154742173.0, "reward": 0.921875, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 1888 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.4272401433691755, "grad_norm": 0.09493768489189344, "learning_rate": 4.742433029292855e-07, "loss": 0.0, "num_tokens": 154824204.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1889 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.43010752688172, "grad_norm": 0.10117158992485455, "learning_rate": 4.7328649102932005e-07, "loss": 0.0, "num_tokens": 154916159.0, "reward": 0.8125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1890 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.4329749103942655, "grad_norm": 0.11939070460326162, "learning_rate": 4.7233034600265373e-07, "loss": -0.0, "num_tokens": 154990673.0, "reward": 0.8828125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1891 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.505166341645741e-09, "advantages/std": 0.5726664066314697, "advantages/var": 0.32794681328419983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.43584229390681, "grad_norm": 0.23549992551626983, "learning_rate": 4.713748690598637e-07, "loss": 0.0, "num_tokens": 155062263.0, "reward": 0.8828125, "reward_std": 0.13941732048988342, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1892 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917669804463866e-09, "advantages/std": 0.46758610010147095, "advantages/var": 0.2186367610081028, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.438709677419355, "grad_norm": 0.10053341096553804, "learning_rate": 4.7042006141068123e-07, "loss": 0.0, "num_tokens": 155139301.0, "reward": 0.9453125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1893 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975041977944071e-09, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.4415770609319, "grad_norm": 0.10090695810415312, "learning_rate": 4.6946592426399134e-07, "loss": -0.0, "num_tokens": 155218436.0, "reward": 0.859375, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1894 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814647963303764e-09, "advantages/std": 0.5227847099304199, "advantages/var": 0.2733038529370333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.444444444444445, "grad_norm": 0.23038553513838003, "learning_rate": 4.685124588278296e-07, "loss": -0.0, "num_tokens": 155297595.0, "reward": 0.8671875, "reward_std": 0.1331065595149994, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1895 }, { "advantages/mean": -7.916241884231567e-09, "advantages/snr": 1.6929196050355823e-08, "advantages/std": 0.46760883927345276, "advantages/var": 0.21865802656666578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.447311827956989, "grad_norm": 0.0979355768194412, "learning_rate": 4.6755966630938084e-07, "loss": 0.0, "num_tokens": 155376489.0, "reward": 0.8671875, "reward_std": 0.11914245784282684, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1896 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.450179211469534, "grad_norm": 0.0, "learning_rate": 4.6660754791497745e-07, "loss": 0.0, "num_tokens": 155445844.0, "reward": 1.0, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 1.0, "rewards/drgrpo_math_reward/std": 0.0, "step": 1897 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.453046594982079, "grad_norm": 0.0725629879509453, "learning_rate": 4.6565610485009953e-07, "loss": 0.0, "num_tokens": 155526491.0, "reward": 0.9296875, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1898 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.4251692372421445e-08, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.455913978494624, "grad_norm": 0.12173701527788272, "learning_rate": 4.6470533831937167e-07, "loss": 0.0, "num_tokens": 155604914.0, "reward": 0.890625, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1899 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629234993268783e-09, "advantages/std": 0.5227856040000916, "advantages/var": 0.27330478774974054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.458781362007168, "grad_norm": 0.11586136349616745, "learning_rate": 4.637552495265616e-07, "loss": 0.0, "num_tokens": 155683803.0, "reward": 0.9375, "reward_std": 0.13098981976509094, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 1900 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.461648745519713, "grad_norm": 0.0, "learning_rate": 4.628058396745786e-07, "loss": 0.0, "num_tokens": 155753711.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1901 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979214976743015e-09, "advantages/std": 0.4676051139831543, "advantages/var": 0.21865454262319872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.464516129032258, "grad_norm": 0.1331306280504158, "learning_rate": 4.6185710996547343e-07, "loss": -0.0, "num_tokens": 155832649.0, "reward": 0.8046875, "reward_std": 0.11784426867961884, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 1902 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.467383512544803, "grad_norm": 0.09456138483613138, "learning_rate": 4.609090616004354e-07, "loss": 0.0, "num_tokens": 155908172.0, "reward": 0.953125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1903 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958278899535023e-10, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.4702508960573475, "grad_norm": 0.09422582550936447, "learning_rate": 4.599616957797903e-07, "loss": 0.0, "num_tokens": 155987999.0, "reward": 0.7734375, "reward_std": 0.1236182376742363, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1904 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149940263022739e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.473118279569892, "grad_norm": 0.10386660888721715, "learning_rate": 4.590150137030009e-07, "loss": 0.0, "num_tokens": 156085665.0, "reward": 0.8828125, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1905 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628065113579712e-09, "advantages/std": 0.5228027701377869, "advantages/var": 0.2733227364637436, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.4759856630824375, "grad_norm": 0.16402560566274624, "learning_rate": 4.5806901656866357e-07, "loss": 0.0, "num_tokens": 156165779.0, "reward": 0.8125, "reward_std": 0.15254521369934082, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1906 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721801332404694e-09, "advantages/std": 0.5227880477905273, "advantages/var": 0.2733073429126307, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.478853046594982, "grad_norm": 0.11023671658190864, "learning_rate": 4.571237055745073e-07, "loss": -0.0, "num_tokens": 156251407.0, "reward": 0.8203125, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1907 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.481720430107527, "grad_norm": 0.06128699635907892, "learning_rate": 4.5617908191739296e-07, "loss": 0.0, "num_tokens": 156320924.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1908 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.484587813620072, "grad_norm": 0.04613452362847414, "learning_rate": 4.5523514679331143e-07, "loss": 0.0, "num_tokens": 156400228.0, "reward": 0.9453125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 1909 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967258348850795e-09, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.487455197132617, "grad_norm": 0.07612845603329545, "learning_rate": 4.5429190139738084e-07, "loss": 0.0, "num_tokens": 156477372.0, "reward": 0.90625, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1910 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.490322580645161, "grad_norm": 0.058345033045249815, "learning_rate": 4.533493469238464e-07, "loss": -0.0, "num_tokens": 156560690.0, "reward": 0.734375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1911 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.493189964157706, "grad_norm": 0.06742502881837097, "learning_rate": 4.524074845660788e-07, "loss": 0.0, "num_tokens": 156642607.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1912 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.496057347670251, "grad_norm": 0.06443206846731298, "learning_rate": 4.51466315516573e-07, "loss": -0.0, "num_tokens": 156731575.0, "reward": 0.890625, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1913 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.498924731182796, "grad_norm": 0.09620239765164847, "learning_rate": 4.505258409669449e-07, "loss": 0.0, "num_tokens": 156818167.0, "reward": 0.625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 1914 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.50179211469534, "grad_norm": 0.0, "learning_rate": 4.495860621079315e-07, "loss": 0.0, "num_tokens": 156895505.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1915 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.1498668924479387e-09, "advantages/std": 0.404969722032547, "advantages/var": 0.16400047576311838, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.504659498207886, "grad_norm": 0.11654146819249533, "learning_rate": 4.486469801293893e-07, "loss": -0.0, "num_tokens": 156973942.0, "reward": 0.7109375, "reward_std": 0.09916213154792786, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1916 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.50752688172043, "grad_norm": 0.0464231644339536, "learning_rate": 4.47708596220293e-07, "loss": 0.0, "num_tokens": 157046572.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1917 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.510394265232975, "grad_norm": 0.11270588447924891, "learning_rate": 4.467709115687324e-07, "loss": 0.0, "num_tokens": 157111547.0, "reward": 0.9765625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 1918 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.97501037071382e-09, "advantages/std": 0.46760883927345276, "advantages/var": 0.21865802656666578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.5132616487455195, "grad_norm": 0.11110330877431249, "learning_rate": 4.4583392736191184e-07, "loss": -0.0, "num_tokens": 157197714.0, "reward": 0.7734375, "reward_std": 0.11914245784282684, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1919 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.7495735248314805e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.516129032258064, "grad_norm": 0.09720678227725901, "learning_rate": 4.448976447861499e-07, "loss": 0.0, "num_tokens": 157285191.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1920 }, { "advantages/mean": 7.916241884231567e-09, "advantages/snr": 1.6929356816321683e-08, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.5189964157706095, "grad_norm": 0.14978010239675016, "learning_rate": 4.4396206502687703e-07, "loss": 0.0, "num_tokens": 157363001.0, "reward": 0.75, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1921 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.521863799283154, "grad_norm": 0.08382491164784144, "learning_rate": 4.430271892686317e-07, "loss": 0.0, "num_tokens": 157440871.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1922 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.524731182795699, "grad_norm": 0.07748904121356069, "learning_rate": 4.420930186950631e-07, "loss": -0.0, "num_tokens": 157515620.0, "reward": 0.84375, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1923 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967124795048994e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.527598566308244, "grad_norm": 0.1088886868490235, "learning_rate": 4.4115955448892725e-07, "loss": 0.0, "num_tokens": 157588147.0, "reward": 0.8515625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1924 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 5.530465949820789, "grad_norm": 0.06511628825323257, "learning_rate": 4.402267978320854e-07, "loss": -0.0, "num_tokens": 157676182.0, "reward": 0.8515625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1925 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.533333333333333, "grad_norm": 0.10998579681902464, "learning_rate": 4.392947499055024e-07, "loss": 0.0, "num_tokens": 157746767.0, "reward": 0.796875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1926 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5055872703345218e-09, "advantages/std": 0.6185775995254517, "advantages/var": 0.38263824663467005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.536200716845878, "grad_norm": 0.1398529103422478, "learning_rate": 4.383634118892472e-07, "loss": 0.0, "num_tokens": 157838953.0, "reward": 0.7421875, "reward_std": 0.19674429297447205, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1927 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.539068100358423, "grad_norm": 0.0903178064579871, "learning_rate": 4.3743278496248926e-07, "loss": 0.0, "num_tokens": 157907395.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1928 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599512249801046e-09, "advantages/std": 0.4049657881259918, "advantages/var": 0.1639972895525057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.541935483870968, "grad_norm": 0.0894662363886373, "learning_rate": 4.365028703034975e-07, "loss": 0.0, "num_tokens": 157981381.0, "reward": 0.9296875, "reward_std": 0.094686359167099, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1929 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.544802867383512, "grad_norm": 0.11168755722385075, "learning_rate": 4.355736690896389e-07, "loss": 0.0, "num_tokens": 158062896.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 1930 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 5.547670250896058, "grad_norm": 0.0, "learning_rate": 4.3464518249737757e-07, "loss": 0.0, "num_tokens": 158146371.0, "reward": 0.75, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1931 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.550537634408602, "grad_norm": 0.06435845077798222, "learning_rate": 4.337174117022733e-07, "loss": -0.0, "num_tokens": 158242393.0, "reward": 0.765625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1932 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.0539321109852954e-08, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.553405017921147, "grad_norm": 0.13040252276106876, "learning_rate": 4.3279035787897845e-07, "loss": -0.0, "num_tokens": 158336544.0, "reward": 0.59375, "reward_std": 0.1820138692855835, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.4930621087551117, "step": 1933 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633261853378446e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.5562724014336915, "grad_norm": 0.03814541076256376, "learning_rate": 4.3186402220123807e-07, "loss": -0.0, "num_tokens": 158422133.0, "reward": 0.8203125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1934 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.559139784946236, "grad_norm": 0.12475499796193422, "learning_rate": 4.3093840584188834e-07, "loss": 0.0, "num_tokens": 158495357.0, "reward": 0.8828125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1935 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.5620071684587815, "grad_norm": 0.11435696114547454, "learning_rate": 4.300135099728549e-07, "loss": 0.0, "num_tokens": 158571381.0, "reward": 0.890625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1936 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.564874551971326, "grad_norm": 0.14605670522014397, "learning_rate": 4.290893357651502e-07, "loss": -0.0, "num_tokens": 158644594.0, "reward": 0.8984375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1937 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.567741935483871, "grad_norm": 1.6022086039464418, "learning_rate": 4.2816588438887336e-07, "loss": 0.0, "num_tokens": 158717798.0, "reward": 0.8984375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1938 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344527836563254e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.570609318996416, "grad_norm": 0.1015551597876325, "learning_rate": 4.2724315701320913e-07, "loss": 0.0, "num_tokens": 158799021.0, "reward": 0.734375, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 1939 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.573476702508961, "grad_norm": 0.05159154932555391, "learning_rate": 4.2632115480642415e-07, "loss": -0.0, "num_tokens": 158874489.0, "reward": 0.7109375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 1940 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907335152797101e-10, "advantages/std": 0.5227840542793274, "advantages/var": 0.27330316740873073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.576344086021505, "grad_norm": 0.11100900508926137, "learning_rate": 4.2539987893586825e-07, "loss": -0.0, "num_tokens": 158965012.0, "reward": 0.71875, "reward_std": 0.13204574584960938, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1941 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.4251629002153753e-08, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.57921146953405, "grad_norm": 0.10140498510511371, "learning_rate": 4.244793305679715e-07, "loss": -0.0, "num_tokens": 159049841.0, "reward": 0.8203125, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1942 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0689055673126508e-08, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.582078853046595, "grad_norm": 0.1923077422965147, "learning_rate": 4.2355951086824195e-07, "loss": 0.0, "num_tokens": 159119943.0, "reward": 0.890625, "reward_std": 0.1173202320933342, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1943 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.8788204215685765e-09, "advantages/std": 0.5726727843284607, "advantages/var": 0.32795411791051166, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.58494623655914, "grad_norm": 0.11581234230628372, "learning_rate": 4.226404210012654e-07, "loss": -0.0, "num_tokens": 159202580.0, "reward": 0.75, "reward_std": 0.1462520956993103, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 1944 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.587813620071684, "grad_norm": 0.05502996160027891, "learning_rate": 4.217220621307043e-07, "loss": 0.0, "num_tokens": 159282107.0, "reward": 0.7421875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 1945 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227847099304199, "advantages/var": 0.2733038529370333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.59068100358423, "grad_norm": 0.1209754534041952, "learning_rate": 4.2080443541929534e-07, "loss": 0.0, "num_tokens": 159375869.0, "reward": 0.7890625, "reward_std": 0.13310657441616058, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1946 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.593548387096774, "grad_norm": 0.10984024628788239, "learning_rate": 4.198875420288477e-07, "loss": -0.0, "num_tokens": 159453028.0, "reward": 0.875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1947 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.596415770609319, "grad_norm": 0.11507229836869538, "learning_rate": 4.189713831202419e-07, "loss": 0.0, "num_tokens": 159535671.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1948 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.5992831541218635, "grad_norm": 0.1137030854495678, "learning_rate": 4.1805595985342967e-07, "loss": 0.0, "num_tokens": 159620953.0, "reward": 0.9765625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 1949 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907227504745508e-10, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.602150537634409, "grad_norm": 0.0914234940800283, "learning_rate": 4.1714127338743086e-07, "loss": 0.0, "num_tokens": 159697331.0, "reward": 0.90625, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 1950 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 5.6050179211469535, "grad_norm": 0.14375288366487707, "learning_rate": 4.162273248803322e-07, "loss": -0.0, "num_tokens": 159789361.0, "reward": 0.875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1951 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.607885304659498, "grad_norm": 0.07067283524310007, "learning_rate": 4.1531411548928554e-07, "loss": 0.0, "num_tokens": 159854093.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 1952 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.781453422160313e-09, "advantages/std": 0.5227880477905273, "advantages/var": 0.2733073429126307, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.610752688172043, "grad_norm": 0.10935663458432411, "learning_rate": 4.14401646370508e-07, "loss": -0.0, "num_tokens": 159945699.0, "reward": 0.8515625, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1953 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299807237755752e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 5.613620071684588, "grad_norm": 0.1492686257219773, "learning_rate": 4.1348991867927987e-07, "loss": 0.0, "num_tokens": 160038116.0, "reward": 0.765625, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1954 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125814501076877e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.616487455197133, "grad_norm": 0.2273844241640259, "learning_rate": 4.1257893356994036e-07, "loss": 0.0, "num_tokens": 160133466.0, "reward": 0.7734375, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1955 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.619354838709677, "grad_norm": 0.089889769658418, "learning_rate": 4.116686921958907e-07, "loss": -0.0, "num_tokens": 160213627.0, "reward": 0.8125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1956 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.7495735248314805e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.622222222222222, "grad_norm": 0.11244164019978824, "learning_rate": 4.1075919570959026e-07, "loss": 0.0, "num_tokens": 160289304.0, "reward": 0.84375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1957 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2945998544314845e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.625089605734767, "grad_norm": 0.09884065280090722, "learning_rate": 4.098504452625544e-07, "loss": 0.0, "num_tokens": 160368798.0, "reward": 0.9296875, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1958 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562855255411359e-09, "advantages/std": 0.5227956175804138, "advantages/var": 0.2733152577612863, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.627956989247312, "grad_norm": 0.12041629172262175, "learning_rate": 4.08942442005354e-07, "loss": -0.0, "num_tokens": 160457466.0, "reward": 0.8515625, "reward_std": 0.14230038225650787, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1959 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.630824372759856, "grad_norm": 0.11398688046581287, "learning_rate": 4.0803518708761455e-07, "loss": 0.0, "num_tokens": 160543054.0, "reward": 0.859375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1960 }, { "advantages/mean": 8.847564458847046e-09, "advantages/snr": 1.692320165841484e-08, "advantages/std": 0.5228067636489868, "advantages/var": 0.27332691211712756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.633691756272402, "grad_norm": 0.08094169219006273, "learning_rate": 4.0712868165801416e-07, "loss": -0.0, "num_tokens": 160636921.0, "reward": 0.7890625, "reward_std": 0.15490421652793884, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 1961 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.636559139784946, "grad_norm": 0.10298150500513412, "learning_rate": 4.0622292686428136e-07, "loss": 0.0, "num_tokens": 160710063.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1962 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.639426523297491, "grad_norm": 0.10455267483184219, "learning_rate": 4.053179238531943e-07, "loss": 0.0, "num_tokens": 160782092.0, "reward": 0.9765625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 1963 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.6422939068100355, "grad_norm": 0.08218972282732463, "learning_rate": 4.044136737705797e-07, "loss": -0.0, "num_tokens": 160848771.0, "reward": 0.7734375, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1964 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262250655110813e-09, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.645161290322581, "grad_norm": 0.13816684884236538, "learning_rate": 4.0351017776131125e-07, "loss": 0.0, "num_tokens": 160925428.0, "reward": 0.890625, "reward_std": 0.16675157845020294, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1965 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.6480286738351255, "grad_norm": 0.12260067352969628, "learning_rate": 4.0260743696930733e-07, "loss": 0.0, "num_tokens": 160998889.0, "reward": 0.9296875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 1966 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.65089605734767, "grad_norm": 0.04081103779018167, "learning_rate": 4.0170545253752984e-07, "loss": 0.0, "num_tokens": 161071658.0, "reward": 0.8515625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1967 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.8994882297977766e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.6537634408602155, "grad_norm": 0.08007249811362589, "learning_rate": 4.00804225607984e-07, "loss": -0.0, "num_tokens": 161147839.0, "reward": 0.78125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1968 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449710856633628e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.65663082437276, "grad_norm": 0.07698763572693008, "learning_rate": 3.9990375732171566e-07, "loss": -0.0, "num_tokens": 161230330.0, "reward": 0.765625, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1969 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.659498207885305, "grad_norm": 0.08224400664250893, "learning_rate": 3.990040488188099e-07, "loss": 0.0, "num_tokens": 161317384.0, "reward": 0.796875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1970 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 9.786419100711427e-09, "advantages/std": 0.6185711622238159, "advantages/var": 0.3826302827349224, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.662365591397849, "grad_norm": 0.14909888159282425, "learning_rate": 3.9810510123838924e-07, "loss": 0.0, "num_tokens": 161397346.0, "reward": 0.8671875, "reward_std": 0.18884867429733276, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1971 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950488391937882e-08, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.665232974910394, "grad_norm": 0.13392262012664546, "learning_rate": 3.972069157186144e-07, "loss": -0.0, "num_tokens": 161476175.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1972 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.668100358422939, "grad_norm": 0.12102725236303907, "learning_rate": 3.963094933966796e-07, "loss": 0.0, "num_tokens": 161550833.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1973 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016504754270957e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.670967741935484, "grad_norm": 0.14141676697189273, "learning_rate": 3.954128354088142e-07, "loss": 0.0, "num_tokens": 161634846.0, "reward": 0.859375, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 1974 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.673835125448028, "grad_norm": 0.05737776158606781, "learning_rate": 3.9451694289027836e-07, "loss": -0.0, "num_tokens": 161725619.0, "reward": 0.8203125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 1975 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.676702508960574, "grad_norm": 0.09190431122668229, "learning_rate": 3.9362181697536466e-07, "loss": -0.0, "num_tokens": 161795519.0, "reward": 0.890625, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 1976 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.679569892473118, "grad_norm": 0.0656476789024684, "learning_rate": 3.927274587973934e-07, "loss": 0.0, "num_tokens": 161867133.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 1977 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.682437275985663, "grad_norm": 0.09649804638539027, "learning_rate": 3.9183386948871465e-07, "loss": -0.0, "num_tokens": 161954954.0, "reward": 0.71875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 1978 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.6853046594982075, "grad_norm": 0.1569984779356773, "learning_rate": 3.9094105018070323e-07, "loss": 0.0, "num_tokens": 162035986.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 1979 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.688172043010753, "grad_norm": 0.043728538486143304, "learning_rate": 3.900490020037607e-07, "loss": 0.0, "num_tokens": 162109285.0, "reward": 0.8671875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1980 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.6910394265232975, "grad_norm": 0.08725815249044895, "learning_rate": 3.8915772608731055e-07, "loss": -0.0, "num_tokens": 162182941.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1981 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.693906810035842, "grad_norm": 0.09439298982117392, "learning_rate": 3.882672235598002e-07, "loss": 0.0, "num_tokens": 162249765.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 1982 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.6967741935483875, "grad_norm": 0.11582483962707424, "learning_rate": 3.8737749554869723e-07, "loss": 0.0, "num_tokens": 162320985.0, "reward": 0.7578125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 1983 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.699641577060932, "grad_norm": 0.07012001279124831, "learning_rate": 3.864885431804882e-07, "loss": 0.0, "num_tokens": 162413913.0, "reward": 0.765625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 1984 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899641578136434e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.702508960573477, "grad_norm": 0.07596301689513589, "learning_rate": 3.856003675806776e-07, "loss": 0.0, "num_tokens": 162497532.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1985 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475770542322e-09, "advantages/std": 0.3306560218334198, "advantages/var": 0.10933340477470299, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.705376344086021, "grad_norm": 0.07054761515917433, "learning_rate": 3.847129698737872e-07, "loss": 0.0, "num_tokens": 162576663.0, "reward": 0.8828125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 1986 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975310081633805e-09, "advantages/std": 0.46758538484573364, "advantages/var": 0.21863609212133284, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.708243727598567, "grad_norm": 0.09447728328744381, "learning_rate": 3.8382635118335417e-07, "loss": 0.0, "num_tokens": 162653726.0, "reward": 0.78125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1987 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721682514236524e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.711111111111111, "grad_norm": 0.1489684627009545, "learning_rate": 3.8294051263192715e-07, "loss": 0.0, "num_tokens": 162729368.0, "reward": 0.78125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 1988 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.34425851086805e-09, "advantages/std": 0.5227980017662048, "advantages/var": 0.2733177506507367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.713978494623656, "grad_norm": 0.13829378872343365, "learning_rate": 3.820554553410693e-07, "loss": -0.0, "num_tokens": 162801784.0, "reward": 0.875, "reward_std": 0.14571532607078552, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 1989 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380984365940094, "advantages/var": 0.05466704299203351, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.7168458781362, "grad_norm": 0.038714326891214505, "learning_rate": 3.8117118043135434e-07, "loss": -0.0, "num_tokens": 162877889.0, "reward": 0.65625, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 1990 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.719713261648746, "grad_norm": 0.08081431191174297, "learning_rate": 3.8028768902236454e-07, "loss": 0.0, "num_tokens": 162945368.0, "reward": 0.953125, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 1991 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.72258064516129, "grad_norm": 0.05293243597736124, "learning_rate": 3.794049822326901e-07, "loss": -0.0, "num_tokens": 163032316.0, "reward": 0.8984375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 1992 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.725448028673835, "grad_norm": 0.07045376100312263, "learning_rate": 3.785230611799289e-07, "loss": 0.0, "num_tokens": 163112655.0, "reward": 0.796875, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1993 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524951534513563e-09, "advantages/std": 0.5726819038391113, "advantages/var": 0.32796456298478915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.7283154121863795, "grad_norm": 0.13659353188995949, "learning_rate": 3.7764192698068367e-07, "loss": -0.0, "num_tokens": 163191994.0, "reward": 0.8671875, "reward_std": 0.15756267309188843, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 1994 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983406762715241e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.731182795698925, "grad_norm": 0.11106040148982503, "learning_rate": 3.767615807505602e-07, "loss": 0.0, "num_tokens": 163276687.0, "reward": 0.84375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 1995 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.7340501792114695, "grad_norm": 0.0737863864678126, "learning_rate": 3.7588202360416677e-07, "loss": -0.0, "num_tokens": 163349138.0, "reward": 0.8125, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 1996 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499147049662961e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.736917562724014, "grad_norm": 0.10175601906137141, "learning_rate": 3.7500325665511335e-07, "loss": 0.0, "num_tokens": 163426094.0, "reward": 0.796875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 1997 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629072505384383e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.7397849462365595, "grad_norm": 0.16061601080350418, "learning_rate": 3.7412528101600914e-07, "loss": 0.0, "num_tokens": 163506487.0, "reward": 0.7734375, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 1998 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.742652329749104, "grad_norm": 0.048944587375708226, "learning_rate": 3.7324809779846113e-07, "loss": 0.0, "num_tokens": 163598578.0, "reward": 0.6953125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 1999 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954734451444e-08, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.745519713261649, "grad_norm": 0.17096242893421115, "learning_rate": 3.723717081130727e-07, "loss": 0.0, "num_tokens": 163677199.0, "reward": 0.890625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2000 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.748387096774193, "grad_norm": 0.07559078252278754, "learning_rate": 3.714961130694435e-07, "loss": 0.0, "num_tokens": 163754801.0, "reward": 0.734375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2001 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.751254480286739, "grad_norm": 0.0, "learning_rate": 3.706213137761669e-07, "loss": 0.0, "num_tokens": 163818241.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2002 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.754121863799283, "grad_norm": 0.09539685136903454, "learning_rate": 3.6974731134082814e-07, "loss": -0.0, "num_tokens": 163887406.0, "reward": 0.9296875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2003 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299802498719973e-09, "advantages/std": 0.4049576222896576, "advantages/var": 0.16399067585049298, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.756989247311828, "grad_norm": 0.10087787426090282, "learning_rate": 3.6887410687000365e-07, "loss": 0.0, "num_tokens": 163973057.0, "reward": 0.8046875, "reward_std": 0.08891239762306213, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2004 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.759856630824372, "grad_norm": 0.07658307484031797, "learning_rate": 3.6800170146926037e-07, "loss": -0.0, "num_tokens": 164053646.0, "reward": 0.8671875, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2005 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.762724014336918, "grad_norm": 0.08196801596565012, "learning_rate": 3.671300962431524e-07, "loss": 0.0, "num_tokens": 164138932.0, "reward": 0.8125, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2006 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.765591397849462, "grad_norm": 0.11316095264189938, "learning_rate": 3.6625929229522176e-07, "loss": -0.0, "num_tokens": 164209748.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2007 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9875832530345343e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.768458781362007, "grad_norm": 0.23771799442133026, "learning_rate": 3.6538929072799516e-07, "loss": 0.0, "num_tokens": 164290862.0, "reward": 0.8515625, "reward_std": 0.10994865745306015, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2008 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.7713261648745515, "grad_norm": 0.07282643843336861, "learning_rate": 3.6452009264298435e-07, "loss": 0.0, "num_tokens": 164364778.0, "reward": 0.9296875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2009 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975300560494157e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.774193548387097, "grad_norm": 0.11096671723426844, "learning_rate": 3.636516991406824e-07, "loss": 0.0, "num_tokens": 164452412.0, "reward": 0.9453125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2010 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.31793053783837e-09, "advantages/std": 0.5726962685585022, "advantages/var": 0.3279810160208321, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.7770609318996415, "grad_norm": 0.1953736314427171, "learning_rate": 3.627841113205652e-07, "loss": 0.0, "num_tokens": 164537183.0, "reward": 0.7421875, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2011 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.979139449767511e-09, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.779928315412186, "grad_norm": 0.08832565021728052, "learning_rate": 3.619173302810874e-07, "loss": -0.0, "num_tokens": 164617332.0, "reward": 0.8671875, "reward_std": 0.1236182376742363, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2012 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.7827956989247316, "grad_norm": 0.04795474434418676, "learning_rate": 3.6105135711968313e-07, "loss": 0.0, "num_tokens": 164675747.0, "reward": 0.984375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2013 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.785663082437276, "grad_norm": 0.0762170456763652, "learning_rate": 3.6018619293276253e-07, "loss": 0.0, "num_tokens": 164750052.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2014 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.788530465949821, "grad_norm": 0.10339891260122874, "learning_rate": 3.5932183881571297e-07, "loss": 0.0, "num_tokens": 164830891.0, "reward": 0.9140625, "reward_std": 0.13098490238189697, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2015 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131336901697577e-10, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.791397849462365, "grad_norm": 0.15732195307573668, "learning_rate": 3.5845829586289454e-07, "loss": 0.0, "num_tokens": 164908518.0, "reward": 0.7890625, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2016 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.794265232974911, "grad_norm": 0.048387986175058725, "learning_rate": 3.57595565167642e-07, "loss": 0.0, "num_tokens": 165000089.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2017 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.987588013390756e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.797132616487455, "grad_norm": 0.10363135854352874, "learning_rate": 3.5673364782226e-07, "loss": 0.0, "num_tokens": 165069221.0, "reward": 0.734375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2018 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.8, "grad_norm": 0.08947989574715134, "learning_rate": 3.5587254491802467e-07, "loss": 0.0, "num_tokens": 165152527.0, "reward": 0.796875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2019 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562971027883829e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.802867383512545, "grad_norm": 0.12634591947892704, "learning_rate": 3.5501225754518114e-07, "loss": 0.0, "num_tokens": 165226762.0, "reward": 0.8984375, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2020 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.80573476702509, "grad_norm": 0.08052528615783176, "learning_rate": 3.5415278679294023e-07, "loss": 0.0, "num_tokens": 165295973.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2021 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.808602150537634, "grad_norm": 0.04711272795780744, "learning_rate": 3.532941337494806e-07, "loss": -0.0, "num_tokens": 165370611.0, "reward": 0.9609375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2022 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.811469534050179, "grad_norm": 0.10591775369957132, "learning_rate": 3.5243629950194544e-07, "loss": 0.0, "num_tokens": 165446015.0, "reward": 0.71875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 2023 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5050695213580615e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.8143369175627235, "grad_norm": 0.11084765542165069, "learning_rate": 3.515792851364403e-07, "loss": 0.0, "num_tokens": 165538784.0, "reward": 0.8671875, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2024 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.817204301075269, "grad_norm": 0.10179440611855767, "learning_rate": 3.5072309173803314e-07, "loss": 0.0, "num_tokens": 165610071.0, "reward": 0.9375, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2025 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 5.8200716845878135, "grad_norm": 0.04839040639140518, "learning_rate": 3.4986772039075285e-07, "loss": 0.0, "num_tokens": 165692835.0, "reward": 0.8203125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2026 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.822939068100358, "grad_norm": 0.11680509758622568, "learning_rate": 3.4901317217758765e-07, "loss": 0.0, "num_tokens": 165777148.0, "reward": 0.7421875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2027 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 5.825806451612904, "grad_norm": 0.07852625663584593, "learning_rate": 3.481594481804826e-07, "loss": 0.0, "num_tokens": 165859661.0, "reward": 0.78125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2028 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.816724861393605e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.828673835125448, "grad_norm": 0.06671980747464423, "learning_rate": 3.4730654948033955e-07, "loss": 0.0, "num_tokens": 165939101.0, "reward": 0.765625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2029 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975300560494157e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.831541218637993, "grad_norm": 0.1531783422723006, "learning_rate": 3.4645447715701627e-07, "loss": 0.0, "num_tokens": 166010891.0, "reward": 0.9453125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2030 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.834408602150537, "grad_norm": 0.08238093946583412, "learning_rate": 3.4560323228932363e-07, "loss": 0.0, "num_tokens": 166087279.0, "reward": 0.828125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2031 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.837275985663083, "grad_norm": 0.0572539307371559, "learning_rate": 3.4475281595502494e-07, "loss": -0.0, "num_tokens": 166162887.0, "reward": 0.9453125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2032 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.4393256658538594e-09, "advantages/std": 0.5726926326751709, "advantages/var": 0.3279768515204182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.840143369175627, "grad_norm": 0.11843726952325846, "learning_rate": 3.439032292308338e-07, "loss": -0.0, "num_tokens": 166233293.0, "reward": 0.859375, "reward_std": 0.1712273508310318, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2033 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6262294661506412e-09, "advantages/std": 0.5726882815361023, "advantages/var": 0.32797186780877396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.843010752688172, "grad_norm": 0.15940737791118242, "learning_rate": 3.4305447319241467e-07, "loss": 0.0, "num_tokens": 166312554.0, "reward": 0.828125, "reward_std": 0.1643974483013153, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2034 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.845878136200717, "grad_norm": 0.16940003745396273, "learning_rate": 3.422065489143798e-07, "loss": -0.0, "num_tokens": 166384633.0, "reward": 0.96875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2035 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.848745519713262, "grad_norm": 0.0826429917536307, "learning_rate": 3.413594574702882e-07, "loss": -0.0, "num_tokens": 166452558.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2036 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.851612903225806, "grad_norm": 0.05171982690009139, "learning_rate": 3.405131999326439e-07, "loss": 0.0, "num_tokens": 166529050.0, "reward": 0.9140625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2037 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0688864335985631e-08, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.854480286738351, "grad_norm": 0.14120005876554986, "learning_rate": 3.396677773728966e-07, "loss": 0.0, "num_tokens": 166614523.0, "reward": 0.8125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2038 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.857347670250896, "grad_norm": 0.0, "learning_rate": 3.3882319086143705e-07, "loss": 0.0, "num_tokens": 166689571.0, "reward": 0.8125, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2039 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975044643629382e-09, "advantages/std": 0.46760615706443787, "advantages/var": 0.21865551812457173, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.860215053763441, "grad_norm": 0.08531543441631127, "learning_rate": 3.3797944146759914e-07, "loss": 0.0, "num_tokens": 166777193.0, "reward": 0.78125, "reward_std": 0.1157275140285492, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2040 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 5.8630824372759855, "grad_norm": 0.06246146558688164, "learning_rate": 3.371365302596554e-07, "loss": -0.0, "num_tokens": 166869982.0, "reward": 0.8515625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2041 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.86594982078853, "grad_norm": 0.08762273111327888, "learning_rate": 3.362944583048184e-07, "loss": 0.0, "num_tokens": 166951809.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2042 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.868817204301076, "grad_norm": 0.08145679479089268, "learning_rate": 3.3545322666923714e-07, "loss": -0.0, "num_tokens": 167028854.0, "reward": 0.953125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2043 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.87168458781362, "grad_norm": 0.09394169673263605, "learning_rate": 3.3461283641799755e-07, "loss": -0.0, "num_tokens": 167107604.0, "reward": 0.7734375, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2044 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.125591088259532e-09, "advantages/std": 0.5228043794631958, "advantages/var": 0.2733244191858972, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.874551971326165, "grad_norm": 0.09850840806702302, "learning_rate": 3.337732886151192e-07, "loss": 0.0, "num_tokens": 167180323.0, "reward": 0.796875, "reward_std": 0.1514892876148224, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2045 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.449710856633628e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.877419354838709, "grad_norm": 0.13922270689328578, "learning_rate": 3.329345843235565e-07, "loss": 0.0, "num_tokens": 167270361.0, "reward": 0.921875, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2046 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.880286738351255, "grad_norm": 0.06267864593183546, "learning_rate": 3.3209672460519423e-07, "loss": -0.0, "num_tokens": 167354979.0, "reward": 0.8125, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2047 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975076251222236e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.883154121863799, "grad_norm": 0.12744459211179202, "learning_rate": 3.312597105208494e-07, "loss": 0.0, "num_tokens": 167433194.0, "reward": 0.7890625, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2048 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.966876488284889e-09, "advantages/std": 0.4675973355770111, "advantages/var": 0.21864726823871994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 5.886021505376344, "grad_norm": 0.09306765835885951, "learning_rate": 3.30423543130267e-07, "loss": 0.0, "num_tokens": 167528659.0, "reward": 0.6875, "reward_std": 0.11100948601961136, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 2049 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.888888888888889, "grad_norm": 0.07762667693115997, "learning_rate": 3.2958822349212137e-07, "loss": 0.0, "num_tokens": 167604439.0, "reward": 0.734375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2050 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.891756272401434, "grad_norm": 0.0957554588223651, "learning_rate": 3.287537526640121e-07, "loss": 0.0, "num_tokens": 167686517.0, "reward": 0.765625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2051 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.894623655913978, "grad_norm": 0.0674792368334321, "learning_rate": 3.279201317024654e-07, "loss": 0.0, "num_tokens": 167771336.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2052 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.974967339721013e-09, "advantages/std": 0.46761220693588257, "advantages/var": 0.21866117607544666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 5.897491039426523, "grad_norm": 0.11858480700480502, "learning_rate": 3.270873616629306e-07, "loss": -0.0, "num_tokens": 167858183.0, "reward": 0.7578125, "reward_std": 0.1236182376742363, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2053 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.900358422939068, "grad_norm": 0.04302305827166325, "learning_rate": 3.2625544359977963e-07, "loss": -0.0, "num_tokens": 167929560.0, "reward": 0.9609375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2054 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.903225806451613, "grad_norm": 0.04371432834433029, "learning_rate": 3.2542437856630644e-07, "loss": 0.0, "num_tokens": 168000634.0, "reward": 0.953125, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2055 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9916920837407456e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.9060931899641576, "grad_norm": 0.16814645143740356, "learning_rate": 3.245941676147247e-07, "loss": -0.0, "num_tokens": 168090091.0, "reward": 0.6953125, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 2056 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.9792225930357895e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.908960573476703, "grad_norm": 0.11896140281355601, "learning_rate": 3.237648117961664e-07, "loss": 0.0, "num_tokens": 168160425.0, "reward": 0.84375, "reward_std": 0.11678344756364822, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2057 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.911827956989248, "grad_norm": 0.046553395878668816, "learning_rate": 3.2293631216068064e-07, "loss": 0.0, "num_tokens": 168230872.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2058 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 5.914695340501792, "grad_norm": 0.09596519244867824, "learning_rate": 3.2210866975723327e-07, "loss": -0.0, "num_tokens": 168314832.0, "reward": 0.75, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2059 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.917562724014337, "grad_norm": 0.09012923952890138, "learning_rate": 3.212818856337047e-07, "loss": -0.0, "num_tokens": 168400345.0, "reward": 0.8984375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2060 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 5.920430107526881, "grad_norm": 0.2208097696376133, "learning_rate": 3.204559608368881e-07, "loss": 0.0, "num_tokens": 168485388.0, "reward": 0.8125, "reward_std": 0.1462520956993103, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2061 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 5.923297491039427, "grad_norm": 0.07525616983142458, "learning_rate": 3.196308964124885e-07, "loss": 0.0, "num_tokens": 168575235.0, "reward": 0.8046875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2062 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.926164874551971, "grad_norm": 0.0, "learning_rate": 3.1880669340512257e-07, "loss": 0.0, "num_tokens": 168648678.0, "reward": 0.8125, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2063 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.929032258064516, "grad_norm": 0.037728568946204376, "learning_rate": 3.1798335285831604e-07, "loss": -0.0, "num_tokens": 168724461.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2064 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.931899641577061, "grad_norm": 0.11745170919584264, "learning_rate": 3.171608758145019e-07, "loss": 0.0, "num_tokens": 168808795.0, "reward": 0.8125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2065 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.934767025089606, "grad_norm": 0.09771017719154974, "learning_rate": 3.1633926331502046e-07, "loss": 0.0, "num_tokens": 168887424.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2066 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96693437436781e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.93763440860215, "grad_norm": 0.0842407973449496, "learning_rate": 3.1551851640011753e-07, "loss": 0.0, "num_tokens": 168961848.0, "reward": 0.9375, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2067 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.126037115417672e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.940501792114695, "grad_norm": 0.17016598218151802, "learning_rate": 3.14698636108943e-07, "loss": -0.0, "num_tokens": 169036398.0, "reward": 0.90625, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2068 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.94336917562724, "grad_norm": 0.09523117928043917, "learning_rate": 3.138796234795493e-07, "loss": 0.0, "num_tokens": 169127303.0, "reward": 0.7421875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2069 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.946236559139785, "grad_norm": 0.08716642063708144, "learning_rate": 3.1306147954888994e-07, "loss": -0.0, "num_tokens": 169216290.0, "reward": 0.7421875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2070 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 5.94910394265233, "grad_norm": 0.01805907462468101, "learning_rate": 3.122442053528197e-07, "loss": -0.0, "num_tokens": 169290507.0, "reward": 0.90625, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2071 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262736431211962e-09, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 5.951971326164875, "grad_norm": 0.2207721407920864, "learning_rate": 3.1142780192609087e-07, "loss": 0.0, "num_tokens": 169374304.0, "reward": 0.84375, "reward_std": 0.1462521106004715, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2072 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.95483870967742, "grad_norm": 0.13343166992067354, "learning_rate": 3.1061227030235437e-07, "loss": 0.0, "num_tokens": 169443130.0, "reward": 0.9453125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2073 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.757858184220564e-09, "advantages/std": 0.5726600289344788, "advantages/var": 0.32793950873923805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.957706093189964, "grad_norm": 0.1864190190940253, "learning_rate": 3.097976115141564e-07, "loss": 0.0, "num_tokens": 169523072.0, "reward": 0.890625, "reward_std": 0.13258251547813416, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2074 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.960573476702509, "grad_norm": 0.0, "learning_rate": 3.0898382659293896e-07, "loss": 0.0, "num_tokens": 169593138.0, "reward": 1.0, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 1.0, "rewards/drgrpo_math_reward/std": 0.0, "step": 2075 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349153895649778e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.963440860215054, "grad_norm": 0.11777017743153291, "learning_rate": 3.081709165690367e-07, "loss": 0.0, "num_tokens": 169675624.0, "reward": 0.8359375, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2076 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.9707659080050774e-09, "advantages/std": 0.46761417388916016, "advantages/var": 0.2186630156220417, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.966308243727599, "grad_norm": 0.12218652579101849, "learning_rate": 3.0735888247167764e-07, "loss": 0.0, "num_tokens": 169747260.0, "reward": 0.8359375, "reward_std": 0.12597234547138214, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2077 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.969175627240143, "grad_norm": 0.08941130491646689, "learning_rate": 3.0654772532897945e-07, "loss": -0.0, "num_tokens": 169830051.0, "reward": 0.8046875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2078 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 5.972043010752688, "grad_norm": 0.07562506176953834, "learning_rate": 3.0573744616795095e-07, "loss": 0.0, "num_tokens": 169914664.0, "reward": 0.7578125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2079 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899627360122966e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.974910394265233, "grad_norm": 0.0868540417840124, "learning_rate": 3.0492804601448805e-07, "loss": -0.0, "num_tokens": 170005076.0, "reward": 0.65625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 2080 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.977777777777778, "grad_norm": 0.11204298312949046, "learning_rate": 3.041195258933749e-07, "loss": 0.0, "num_tokens": 170084455.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2081 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.980645161290322, "grad_norm": 0.10929001150835986, "learning_rate": 3.033118868282802e-07, "loss": 0.0, "num_tokens": 170158408.0, "reward": 0.9609375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2082 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.983512544802867, "grad_norm": 0.17984310267930984, "learning_rate": 3.0250512984175846e-07, "loss": -0.0, "num_tokens": 170236538.0, "reward": 0.9296875, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2083 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 5.986379928315412, "grad_norm": 0.05927892985218351, "learning_rate": 3.01699255955246e-07, "loss": 0.0, "num_tokens": 170315219.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2084 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.989247311827957, "grad_norm": 0.06299542985097878, "learning_rate": 3.008942661890627e-07, "loss": 0.0, "num_tokens": 170394850.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2085 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.394200364231044e-08, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 5.992114695340502, "grad_norm": 0.1320863635374229, "learning_rate": 3.000901615624075e-07, "loss": 0.0, "num_tokens": 170473179.0, "reward": 0.9453125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2086 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 5.994982078853047, "grad_norm": 0.08514137587578359, "learning_rate": 2.9928694309335913e-07, "loss": 0.0, "num_tokens": 170538199.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2087 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.2470398597593402e-08, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 5.997849462365592, "grad_norm": 0.12653525987136202, "learning_rate": 2.9848461179887474e-07, "loss": 0.0, "num_tokens": 170617091.0, "reward": 0.9453125, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2088 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878705662607237e-09, "advantages/std": 0.5726862549781799, "advantages/var": 0.3279695466409329, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.002867383512545, "grad_norm": 0.1526324127718748, "learning_rate": 2.9768316869478836e-07, "loss": 0.0, "num_tokens": 170694797.0, "reward": 0.8203125, "reward_std": 0.16439256072044373, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2089 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878773704879678e-09, "advantages/std": 0.57267826795578, "advantages/var": 0.3279603985888322, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.00573476702509, "grad_norm": 0.1224702558266982, "learning_rate": 2.968826147958088e-07, "loss": 0.0, "num_tokens": 170780478.0, "reward": 0.734375, "reward_std": 0.1552036553621292, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2090 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.1266523706756892e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.008602150537635, "grad_norm": 0.16566949778844922, "learning_rate": 2.9608295111551904e-07, "loss": 0.0, "num_tokens": 170851318.0, "reward": 0.8359375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2091 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9753001796492024e-09, "advantages/std": 0.4675861597061157, "advantages/var": 0.21863681674871316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.011469534050179, "grad_norm": 0.088479984386407, "learning_rate": 2.952841786663757e-07, "loss": 0.0, "num_tokens": 170935472.0, "reward": 0.8828125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2092 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 6.014336917562724, "grad_norm": 0.10912541991364041, "learning_rate": 2.9448629845970675e-07, "loss": 0.0, "num_tokens": 171018216.0, "reward": 0.859375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2093 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149940263022739e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.017204301075269, "grad_norm": 0.0651784278670309, "learning_rate": 2.936893115057101e-07, "loss": 0.0, "num_tokens": 171101556.0, "reward": 0.8828125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2094 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.020071684587814, "grad_norm": 0.08269475298043079, "learning_rate": 2.9289321881345254e-07, "loss": 0.0, "num_tokens": 171180424.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2095 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 6.022939068100358, "grad_norm": 0.06010288134659172, "learning_rate": 2.920980213908695e-07, "loss": 0.0, "num_tokens": 171262183.0, "reward": 0.734375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2096 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.025806451612903, "grad_norm": 0.06993152989619203, "learning_rate": 2.9130372024476247e-07, "loss": 0.0, "num_tokens": 171347311.0, "reward": 0.8828125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2097 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.028673835125448, "grad_norm": 0.08117952217521109, "learning_rate": 2.905103163807982e-07, "loss": 0.0, "num_tokens": 171427232.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2098 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149940263022739e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.031541218637993, "grad_norm": 0.08009694635732721, "learning_rate": 2.8971781080350665e-07, "loss": 0.0, "num_tokens": 171500263.0, "reward": 0.9453125, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2099 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449667444137735e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.034408602150537, "grad_norm": 0.07636092627316961, "learning_rate": 2.889262045162817e-07, "loss": -0.0, "num_tokens": 171575919.0, "reward": 0.9296875, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2100 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.037275985663083, "grad_norm": 0.26612624401000295, "learning_rate": 2.8813549852137817e-07, "loss": 0.0, "num_tokens": 171668774.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2101 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.97925877074466e-09, "advantages/std": 0.4676010012626648, "advantages/var": 0.21865069638184664, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.040143369175627, "grad_norm": 0.1473152837871326, "learning_rate": 2.8734569381991083e-07, "loss": -0.0, "num_tokens": 171740920.0, "reward": 0.796875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2102 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.043010752688172, "grad_norm": 0.10967122349031659, "learning_rate": 2.8655679141185283e-07, "loss": 0.0, "num_tokens": 171817552.0, "reward": 0.78125, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2103 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33065706491470337, "advantages/var": 0.10933409457800636, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.045878136200717, "grad_norm": 0.10154421462493861, "learning_rate": 2.85768792296036e-07, "loss": -0.0, "num_tokens": 171892113.0, "reward": 0.9375, "reward_std": 0.06681530922651291, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2104 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.048745519713262, "grad_norm": 0.04514155771527937, "learning_rate": 2.849816974701482e-07, "loss": 0.0, "num_tokens": 171970780.0, "reward": 0.828125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2105 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.051612903225807, "grad_norm": 0.10568532434878228, "learning_rate": 2.841955079307319e-07, "loss": 0.0, "num_tokens": 172055799.0, "reward": 0.90625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2106 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.054480286738351, "grad_norm": 0.05230799095516218, "learning_rate": 2.8341022467318334e-07, "loss": -0.0, "num_tokens": 172129948.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2107 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.057347670250896, "grad_norm": 0.06759943098625627, "learning_rate": 2.8262584869175223e-07, "loss": 0.0, "num_tokens": 172217712.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2108 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.060215053763441, "grad_norm": 0.13865722005142697, "learning_rate": 2.818423809795384e-07, "loss": 0.0, "num_tokens": 172302415.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2109 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.975166506069069e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 6.063082437275986, "grad_norm": 0.152552453284122, "learning_rate": 2.810598225284928e-07, "loss": 0.0, "num_tokens": 172388646.0, "reward": 0.8046875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2110 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.06594982078853, "grad_norm": 0.10379550817821541, "learning_rate": 2.8027817432941425e-07, "loss": 0.0, "num_tokens": 172459177.0, "reward": 0.9609375, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2111 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.068817204301075, "grad_norm": 0.12102448385517477, "learning_rate": 2.7949743737194985e-07, "loss": 0.0, "num_tokens": 172541974.0, "reward": 0.8828125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2112 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.033763001694113e-09, "advantages/std": 0.6185612082481384, "advantages/var": 0.3826179683493969, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.07168458781362, "grad_norm": 0.1951219789894647, "learning_rate": 2.7871761264459225e-07, "loss": 0.0, "num_tokens": 172629292.0, "reward": 0.828125, "reward_std": 0.1751839816570282, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2113 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4496412956177784e-09, "advantages/std": 0.40496495366096497, "advantages/var": 0.1639966136936275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.074551971326165, "grad_norm": 0.07114323224805799, "learning_rate": 2.7793870113468e-07, "loss": 0.0, "num_tokens": 172703660.0, "reward": 0.890625, "reward_std": 0.0936255231499672, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2114 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.50497202691776e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.077419354838709, "grad_norm": 0.12900599247939767, "learning_rate": 2.771607038283942e-07, "loss": 0.0, "num_tokens": 172778530.0, "reward": 0.8515625, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2115 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.080286738351255, "grad_norm": 0.07064987751422128, "learning_rate": 2.7638362171076e-07, "loss": 0.0, "num_tokens": 172852560.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2116 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.5933536669925965e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.0831541218637994, "grad_norm": 0.14764179501176933, "learning_rate": 2.756074557656424e-07, "loss": 0.0, "num_tokens": 172930649.0, "reward": 0.9296875, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2117 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.086021505376344, "grad_norm": 0.0331028529995827, "learning_rate": 2.748322069757476e-07, "loss": 0.0, "num_tokens": 173013002.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2118 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.088888888888889, "grad_norm": 0.10707429291468022, "learning_rate": 2.740578763226193e-07, "loss": -0.0, "num_tokens": 173080690.0, "reward": 0.8046875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2119 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.7495735248314805e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.091756272401434, "grad_norm": 0.10277021605296317, "learning_rate": 2.7328446478664036e-07, "loss": -0.0, "num_tokens": 173158696.0, "reward": 0.78125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2120 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983355479520339e-09, "advantages/std": 0.4676070511341095, "advantages/var": 0.2186563542703377, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 6.094623655913979, "grad_norm": 0.09058752972675965, "learning_rate": 2.7251197334702835e-07, "loss": -0.0, "num_tokens": 173252745.0, "reward": 0.8046875, "reward_std": 0.12019838392734528, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2121 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.097491039426523, "grad_norm": 0.11113347953395288, "learning_rate": 2.717404029818371e-07, "loss": 0.0, "num_tokens": 173343239.0, "reward": 0.7578125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2122 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 6.100358422939068, "grad_norm": 0.0808273680316428, "learning_rate": 2.7096975466795367e-07, "loss": 0.0, "num_tokens": 173431614.0, "reward": 0.7421875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2123 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.103225806451613, "grad_norm": 0.11090880856332312, "learning_rate": 2.7020002938109756e-07, "loss": 0.0, "num_tokens": 173515046.0, "reward": 0.9296875, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2124 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.106093189964158, "grad_norm": 0.06491809513310076, "learning_rate": 2.6943122809581997e-07, "loss": 0.0, "num_tokens": 173601146.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2125 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.108960573476702, "grad_norm": 0.08643256079925163, "learning_rate": 2.6866335178550257e-07, "loss": 0.0, "num_tokens": 173683728.0, "reward": 0.875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2126 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.2044708608067508e-08, "advantages/std": 0.6185770630836487, "advantages/var": 0.3826375829731923, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.111827956989248, "grad_norm": 0.17513950419277075, "learning_rate": 2.678964014223553e-07, "loss": -0.0, "num_tokens": 173780660.0, "reward": 0.6875, "reward_std": 0.19568344950675964, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4653336703777313, "step": 2127 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.114695340501792, "grad_norm": 0.08235783336423096, "learning_rate": 2.6713037797741543e-07, "loss": 0.0, "num_tokens": 173864945.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2128 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.117562724014337, "grad_norm": 0.12465393811280526, "learning_rate": 2.663652824205476e-07, "loss": 0.0, "num_tokens": 173926974.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2129 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.120430107526881, "grad_norm": 0.04977249967066248, "learning_rate": 2.656011157204415e-07, "loss": 0.0, "num_tokens": 174004946.0, "reward": 0.8984375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2130 }, { "advantages/mean": -7.916241884231567e-09, "advantages/snr": 1.6929285604174867e-08, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.123297491039427, "grad_norm": 0.10518807899595124, "learning_rate": 2.648378788446102e-07, "loss": 0.0, "num_tokens": 174084487.0, "reward": 0.796875, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2131 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983334154224049e-09, "advantages/std": 0.46760955452919006, "advantages/var": 0.21865869548698758, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.1261648745519715, "grad_norm": 0.1088275661340618, "learning_rate": 2.6407557275938955e-07, "loss": 0.0, "num_tokens": 174158577.0, "reward": 0.765625, "reward_std": 0.12020328640937805, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2132 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.129032258064516, "grad_norm": 0.0946231878141697, "learning_rate": 2.633141984299374e-07, "loss": -0.0, "num_tokens": 174231906.0, "reward": 0.8828125, "reward_std": 0.08443662524223328, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2133 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.131899641577061, "grad_norm": 0.09503508407204056, "learning_rate": 2.625537568202322e-07, "loss": 0.0, "num_tokens": 174304214.0, "reward": 0.828125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2134 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.134767025089606, "grad_norm": 0.028863618471970904, "learning_rate": 2.6179424889307043e-07, "loss": -0.0, "num_tokens": 174374186.0, "reward": 0.9453125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2135 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.041852881974499e-10, "advantages/std": 0.661276638507843, "advantages/var": 0.4372867926362325, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 6.137634408602151, "grad_norm": 0.14794659570921173, "learning_rate": 2.61035675610067e-07, "loss": 0.0, "num_tokens": 174466360.0, "reward": 0.671875, "reward_std": 0.2109457552433014, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 2136 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9750579720916185e-09, "advantages/std": 0.4676051139831543, "advantages/var": 0.21865454262319872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.140501792114695, "grad_norm": 0.11420182933445344, "learning_rate": 2.6027803793165347e-07, "loss": 0.0, "num_tokens": 174546294.0, "reward": 0.8515625, "reward_std": 0.11784427613019943, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2137 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.14336917562724, "grad_norm": 0.08110650673113542, "learning_rate": 2.595213368170772e-07, "loss": -0.0, "num_tokens": 174637659.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2138 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235170151758147e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.146236559139785, "grad_norm": 0.14099708405961642, "learning_rate": 2.5876557322439916e-07, "loss": 0.0, "num_tokens": 174712307.0, "reward": 0.9375, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2139 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.95842995348603e-10, "advantages/std": 0.4676051139831543, "advantages/var": 0.21865454262319872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.14910394265233, "grad_norm": 0.08674400597712102, "learning_rate": 2.5801074811049315e-07, "loss": -0.0, "num_tokens": 174794762.0, "reward": 0.7421875, "reward_std": 0.11784426867961884, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2140 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.151971326164874, "grad_norm": 0.04589011037766571, "learning_rate": 2.572568624310458e-07, "loss": 0.0, "num_tokens": 174876942.0, "reward": 0.7890625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2141 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.15483870967742, "grad_norm": 0.06478111910034079, "learning_rate": 2.5650391714055296e-07, "loss": 0.0, "num_tokens": 174958060.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2142 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.157706093189964, "grad_norm": 0.11004705135365284, "learning_rate": 2.5575191319232127e-07, "loss": 0.0, "num_tokens": 175042746.0, "reward": 0.90625, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2143 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.160573476702509, "grad_norm": 0.09478780170202693, "learning_rate": 2.550008515384642e-07, "loss": 0.0, "num_tokens": 175113961.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2144 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.163440860215053, "grad_norm": 0.07327068747479794, "learning_rate": 2.542507331299033e-07, "loss": 0.0, "num_tokens": 175193209.0, "reward": 0.796875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2145 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.199247907244247e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.166308243727599, "grad_norm": 0.1129679091727922, "learning_rate": 2.5350155891636495e-07, "loss": -0.0, "num_tokens": 175281416.0, "reward": 0.7265625, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2146 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.2197141188192748e-08, "advantages/std": 0.5726685523986816, "advantages/var": 0.3279492709064016, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.1691756272401435, "grad_norm": 0.12686317848230652, "learning_rate": 2.52753329846381e-07, "loss": 0.0, "num_tokens": 175371588.0, "reward": 0.875, "reward_std": 0.14283224940299988, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2147 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.172043010752688, "grad_norm": 0.08997490709775456, "learning_rate": 2.5200604686728555e-07, "loss": 0.0, "num_tokens": 175451750.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2148 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.174910394265233, "grad_norm": 0.10502798239013662, "learning_rate": 2.5125971092521604e-07, "loss": -0.0, "num_tokens": 175529707.0, "reward": 0.8046875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2149 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.177777777777778, "grad_norm": 0.07182949272215379, "learning_rate": 2.5051432296510976e-07, "loss": 0.0, "num_tokens": 175607197.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2150 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 8.944085194988241e-09, "advantages/std": 0.5726996064186096, "advantages/var": 0.32798483919203036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.180645161290323, "grad_norm": 0.1386335795070189, "learning_rate": 2.4976988393070476e-07, "loss": 0.0, "num_tokens": 175688409.0, "reward": 0.75, "reward_std": 0.17912298440933228, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2151 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2945924285545487e-08, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.183512544802867, "grad_norm": 0.09564244720437229, "learning_rate": 2.490263947645367e-07, "loss": 0.0, "num_tokens": 175773164.0, "reward": 0.8125, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2152 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.186379928315413, "grad_norm": 0.0, "learning_rate": 2.482838564079397e-07, "loss": 0.0, "num_tokens": 175828254.0, "reward": 1.0, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 1.0, "rewards/drgrpo_math_reward/std": 0.0, "step": 2153 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966992261291921e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.189247311827957, "grad_norm": 0.13772180316010407, "learning_rate": 2.4754226980104274e-07, "loss": -0.0, "num_tokens": 175910301.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2154 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975300560494157e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 6.192114695340502, "grad_norm": 0.10975078879450248, "learning_rate": 2.4680163588277113e-07, "loss": 0.0, "num_tokens": 175989564.0, "reward": 0.8203125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2155 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.194982078853046, "grad_norm": 0.10427408056680003, "learning_rate": 2.46061955590843e-07, "loss": 0.0, "num_tokens": 176060090.0, "reward": 0.859375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2156 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5050938954247684e-09, "advantages/std": 0.5726727843284607, "advantages/var": 0.32795411791051166, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.197849462365592, "grad_norm": 0.11036299784771562, "learning_rate": 2.4532322986176925e-07, "loss": 0.0, "num_tokens": 176140610.0, "reward": 0.90625, "reward_std": 0.1462520956993103, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2157 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.200716845878136, "grad_norm": 0.12461236277193655, "learning_rate": 2.4458545963085255e-07, "loss": 0.0, "num_tokens": 176221669.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2158 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.203584229390681, "grad_norm": 0.0, "learning_rate": 2.438486458321859e-07, "loss": 0.0, "num_tokens": 176303479.0, "reward": 1.0, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 1.0, "rewards/drgrpo_math_reward/std": 0.0, "step": 2159 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 6.2064516129032254, "grad_norm": 0.08428672676983999, "learning_rate": 2.43112789398651e-07, "loss": 0.0, "num_tokens": 176392961.0, "reward": 0.7734375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2160 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.209318996415771, "grad_norm": 0.07445476343188323, "learning_rate": 2.423778912619171e-07, "loss": 0.0, "num_tokens": 176475304.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2161 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.2121863799283155, "grad_norm": 0.09577628669796408, "learning_rate": 2.4164395235244096e-07, "loss": -0.0, "num_tokens": 176548466.0, "reward": 0.859375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2162 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.21505376344086, "grad_norm": 0.07044764921255095, "learning_rate": 2.409109735994647e-07, "loss": 0.0, "num_tokens": 176633891.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2163 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.217921146953405, "grad_norm": 0.07468729755447735, "learning_rate": 2.4017895593101424e-07, "loss": 0.0, "num_tokens": 176709801.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2164 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.22078853046595, "grad_norm": 0.15074373847353156, "learning_rate": 2.3944790027389885e-07, "loss": 0.0, "num_tokens": 176791822.0, "reward": 0.7578125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2165 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975166506069069e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.223655913978495, "grad_norm": 0.09859531379863297, "learning_rate": 2.3871780755371e-07, "loss": -0.0, "num_tokens": 176877808.0, "reward": 0.8046875, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2166 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.226523297491039, "grad_norm": 0.09765968271834147, "learning_rate": 2.379886786948204e-07, "loss": 0.0, "num_tokens": 176964248.0, "reward": 0.8828125, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2167 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299829409932592e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.229390681003585, "grad_norm": 0.07368850019587517, "learning_rate": 2.3726051462038155e-07, "loss": 0.0, "num_tokens": 177038422.0, "reward": 0.875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2168 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449710856633628e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.232258064516129, "grad_norm": 0.13374477118857409, "learning_rate": 2.3653331625232365e-07, "loss": -0.0, "num_tokens": 177123127.0, "reward": 0.703125, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 2169 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971078240891425e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 6.235125448028674, "grad_norm": 0.13838951832047827, "learning_rate": 2.3580708451135445e-07, "loss": -0.0, "num_tokens": 177210861.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2170 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449667444137735e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.237992831541218, "grad_norm": 0.07130469931870363, "learning_rate": 2.3508182031695856e-07, "loss": -0.0, "num_tokens": 177291262.0, "reward": 0.8046875, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2171 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814036618739657e-09, "advantages/std": 0.5228026509284973, "advantages/var": 0.2733226118178642, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.240860215053764, "grad_norm": 0.12359943260731508, "learning_rate": 2.3435752458739356e-07, "loss": 0.0, "num_tokens": 177377112.0, "reward": 0.8125, "reward_std": 0.14913517236709595, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2172 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.243727598566308, "grad_norm": 0.11358099968057224, "learning_rate": 2.3363419823969276e-07, "loss": 0.0, "num_tokens": 177457168.0, "reward": 0.9609375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2173 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.246594982078853, "grad_norm": 0.10146949140676484, "learning_rate": 2.3291184218966163e-07, "loss": -0.0, "num_tokens": 177532809.0, "reward": 0.8671875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2174 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.2494623655913975, "grad_norm": 0.07527906630518931, "learning_rate": 2.3219045735187647e-07, "loss": 0.0, "num_tokens": 177618491.0, "reward": 0.796875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2175 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.449710856633628e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.252329749103943, "grad_norm": 0.07065746652929582, "learning_rate": 2.31470044639685e-07, "loss": 0.0, "num_tokens": 177705335.0, "reward": 0.796875, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2176 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.2551971326164875, "grad_norm": 0.0598040880718208, "learning_rate": 2.3075060496520304e-07, "loss": 0.0, "num_tokens": 177782197.0, "reward": 0.9140625, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2177 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.258064516129032, "grad_norm": 0.08774742986790071, "learning_rate": 2.3003213923931543e-07, "loss": 0.0, "num_tokens": 177864611.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2178 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958516906788102e-10, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.260931899641577, "grad_norm": 0.10555678125680912, "learning_rate": 2.2931464837167303e-07, "loss": -0.0, "num_tokens": 177938506.0, "reward": 0.921875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2179 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6722005458391303e-09, "advantages/std": 0.5227840542793274, "advantages/var": 0.27330316740873073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.263799283154122, "grad_norm": 0.15601084798585932, "learning_rate": 2.2859813327069323e-07, "loss": 0.0, "num_tokens": 178019745.0, "reward": 0.84375, "reward_std": 0.13204574584960938, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2180 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.199387373273677e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.266666666666667, "grad_norm": 0.09967947574545993, "learning_rate": 2.278825948435571e-07, "loss": 0.0, "num_tokens": 178095181.0, "reward": 0.8359375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2181 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022367650963836e-09, "advantages/std": 0.6185756921768188, "advantages/var": 0.38263588695203055, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.269534050179211, "grad_norm": 0.18715205810324997, "learning_rate": 2.2716803399621022e-07, "loss": 0.0, "num_tokens": 178178090.0, "reward": 0.796875, "reward_std": 0.19673937559127808, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2182 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975244195968941e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.272401433691757, "grad_norm": 0.1651917055597025, "learning_rate": 2.264544516333594e-07, "loss": 0.0, "num_tokens": 178251810.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2183 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814647963303764e-09, "advantages/std": 0.5227847099304199, "advantages/var": 0.2733038529370333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.275268817204301, "grad_norm": 0.12158024065271945, "learning_rate": 2.2574184865847345e-07, "loss": 0.0, "num_tokens": 178317584.0, "reward": 0.8515625, "reward_std": 0.13310657441616058, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2184 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.278136200716846, "grad_norm": 0.13190135349811286, "learning_rate": 2.250302259737803e-07, "loss": 0.0, "num_tokens": 178386650.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2185 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983562397524497e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.28100358422939, "grad_norm": 0.20644704536939218, "learning_rate": 2.2431958448026788e-07, "loss": 0.0, "num_tokens": 178455960.0, "reward": 0.9140625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2186 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975243815131171e-09, "advantages/std": 0.4675905704498291, "advantages/var": 0.2186409415735966, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.283870967741936, "grad_norm": 0.14604786537556594, "learning_rate": 2.236099250776805e-07, "loss": 0.0, "num_tokens": 178542847.0, "reward": 0.859375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2187 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.4536934733273465e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.28673835125448, "grad_norm": 0.11127009467183546, "learning_rate": 2.2290124866452031e-07, "loss": 0.0, "num_tokens": 178622477.0, "reward": 0.796875, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2188 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.289605734767025, "grad_norm": 0.0683364615828289, "learning_rate": 2.2219355613804402e-07, "loss": 0.0, "num_tokens": 178710624.0, "reward": 0.7578125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2189 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.29247311827957, "grad_norm": 0.0, "learning_rate": 2.2148684839426258e-07, "loss": 0.0, "num_tokens": 178788195.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2190 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983400669593257e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.295340501792115, "grad_norm": 0.13152233811811465, "learning_rate": 2.2078112632794088e-07, "loss": 0.0, "num_tokens": 178873788.0, "reward": 0.8046875, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2191 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.2982078853046595, "grad_norm": 0.04896511793291085, "learning_rate": 2.2007639083259543e-07, "loss": -0.0, "num_tokens": 178946382.0, "reward": 0.8828125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2192 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.5933536669925965e-08, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.301075268817204, "grad_norm": 0.3471667942809353, "learning_rate": 2.193726428004936e-07, "loss": 0.0, "num_tokens": 179019434.0, "reward": 0.9296875, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2193 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983539800525091e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.3039426523297495, "grad_norm": 0.09015407532640535, "learning_rate": 2.186698831226521e-07, "loss": -0.0, "num_tokens": 179107353.0, "reward": 0.703125, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 2194 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562971027883829e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.306810035842294, "grad_norm": 0.09622614826554285, "learning_rate": 2.1796811268883707e-07, "loss": 0.0, "num_tokens": 179193378.0, "reward": 0.8515625, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2195 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.691935831188304e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.309677419354839, "grad_norm": 0.1222421270988558, "learning_rate": 2.1726733238756212e-07, "loss": 0.0, "num_tokens": 179284419.0, "reward": 0.6015625, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4915000796318054, "step": 2196 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 6.312544802867383, "grad_norm": 0.08677875732593755, "learning_rate": 2.165675431060866e-07, "loss": -0.0, "num_tokens": 179371660.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2197 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.315412186379929, "grad_norm": 0.09056154349879256, "learning_rate": 2.1586874573041524e-07, "loss": 0.0, "num_tokens": 179441110.0, "reward": 0.9765625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 2198 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.318279569892473, "grad_norm": 0.11920633426040753, "learning_rate": 2.1517094114529742e-07, "loss": 0.0, "num_tokens": 179504970.0, "reward": 0.90625, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2199 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.321146953405018, "grad_norm": 0.08747785758403151, "learning_rate": 2.1447413023422556e-07, "loss": 0.0, "num_tokens": 179583427.0, "reward": 0.8671875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2200 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5050383769830774e-09, "advantages/std": 0.5726776719093323, "advantages/var": 0.3279597159034928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.324014336917562, "grad_norm": 0.12711324194947438, "learning_rate": 2.1377831387943346e-07, "loss": -0.0, "num_tokens": 179677651.0, "reward": 0.7421875, "reward_std": 0.154142826795578, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2201 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.326881720430108, "grad_norm": 0.05299389151286508, "learning_rate": 2.1308349296189566e-07, "loss": 0.0, "num_tokens": 179754552.0, "reward": 0.9140625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2202 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.329749103942652, "grad_norm": 0.09902881237398599, "learning_rate": 2.123896683613269e-07, "loss": 0.0, "num_tokens": 179832497.0, "reward": 0.9453125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2203 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.332616487455197, "grad_norm": 0.10751438341166206, "learning_rate": 2.116968409561809e-07, "loss": -0.0, "num_tokens": 179905768.0, "reward": 0.8125, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2204 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.335483870967742, "grad_norm": 0.04054469876989426, "learning_rate": 2.1100501162364703e-07, "loss": 0.0, "num_tokens": 179977415.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2205 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.338351254480287, "grad_norm": 0.07256929125803574, "learning_rate": 2.103141812396526e-07, "loss": 0.0, "num_tokens": 180047196.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2206 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 6.3412186379928315, "grad_norm": 0.07705694937524302, "learning_rate": 2.096243506788602e-07, "loss": 0.0, "num_tokens": 180138073.0, "reward": 0.6796875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 2207 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.344086021505376, "grad_norm": 0.05616303608045406, "learning_rate": 2.0893552081466559e-07, "loss": 0.0, "num_tokens": 180213527.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2208 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049402934764073e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 6.3469534050179215, "grad_norm": 0.09284175268100746, "learning_rate": 2.082476925191977e-07, "loss": -0.0, "num_tokens": 180290202.0, "reward": 0.90625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2209 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.349820788530466, "grad_norm": 0.0, "learning_rate": 2.0756086666331818e-07, "loss": 0.0, "num_tokens": 180364140.0, "reward": 0.8125, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2210 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383829008079518e-08, "advantages/std": 0.5726770758628845, "advantages/var": 0.327959033218864, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 6.352688172043011, "grad_norm": 0.18350218141560903, "learning_rate": 2.0687504411661895e-07, "loss": 0.0, "num_tokens": 180441817.0, "reward": 0.75, "reward_std": 0.1530819833278656, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2211 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380984365940094, "advantages/var": 0.05466704299203351, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.355555555555555, "grad_norm": 0.06370705177952349, "learning_rate": 2.0619022574742118e-07, "loss": -0.0, "num_tokens": 180502220.0, "reward": 0.90625, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2212 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9834382441424445e-09, "advantages/std": 0.4675973355770111, "advantages/var": 0.21864726823871994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.358422939068101, "grad_norm": 0.10124231580148321, "learning_rate": 2.0550641242277577e-07, "loss": 0.0, "num_tokens": 180589674.0, "reward": 0.859375, "reward_std": 0.11100947856903076, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2213 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.361290322580645, "grad_norm": 0.06561819334727217, "learning_rate": 2.0482360500845996e-07, "loss": 0.0, "num_tokens": 180661624.0, "reward": 0.9140625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2214 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.36415770609319, "grad_norm": 0.07398576384984183, "learning_rate": 2.0414180436897844e-07, "loss": 0.0, "num_tokens": 180745594.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2215 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675973355770111, "advantages/var": 0.21864726823871994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.367025089605734, "grad_norm": 0.13403602765557324, "learning_rate": 2.0346101136756e-07, "loss": 0.0, "num_tokens": 180821614.0, "reward": 0.921875, "reward_std": 0.11100947856903076, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2216 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33063647150993347, "advantages/var": 0.10932047629253905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.36989247311828, "grad_norm": 0.11993568492098934, "learning_rate": 2.0278122686615918e-07, "loss": 0.0, "num_tokens": 180897012.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2217 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.317976997561578e-09, "advantages/std": 0.5726926326751709, "advantages/var": 0.3279768515204182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.372759856630824, "grad_norm": 0.11492715960757334, "learning_rate": 2.0210245172545226e-07, "loss": 0.0, "num_tokens": 181002163.0, "reward": 0.625, "reward_std": 0.1712273508310318, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4860251843929291, "step": 2218 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.375627240143369, "grad_norm": 0.11422293213003529, "learning_rate": 2.014246868048385e-07, "loss": 0.0, "num_tokens": 181082317.0, "reward": 0.8046875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2219 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.378494623655914, "grad_norm": 0.07362381212389328, "learning_rate": 2.007479329624374e-07, "loss": 0.0, "num_tokens": 181159318.0, "reward": 0.8203125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2220 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907185867989424e-10, "advantages/std": 0.5227928161621094, "advantages/var": 0.2733123286307091, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.381362007168459, "grad_norm": 0.12756889894538026, "learning_rate": 2.0007219105508933e-07, "loss": 0.0, "num_tokens": 181244107.0, "reward": 0.7578125, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2221 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958683201273463e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.3842293906810035, "grad_norm": 0.09031641699231324, "learning_rate": 1.9939746193835228e-07, "loss": 0.0, "num_tokens": 181325577.0, "reward": 0.8828125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2222 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.2584259521113475e-09, "advantages/std": 0.6185652613639832, "advantages/var": 0.3826229825662928, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.387096774193548, "grad_norm": 0.1594836306495329, "learning_rate": 1.9872374646650237e-07, "loss": 0.0, "num_tokens": 181422169.0, "reward": 0.65625, "reward_std": 0.1820138692855835, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 2223 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.130995003363801e-09, "advantages/std": 0.5726990103721619, "advantages/var": 0.32798415648125356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.3899641577060935, "grad_norm": 0.16746281921590245, "learning_rate": 1.980510454925327e-07, "loss": -0.0, "num_tokens": 181503864.0, "reward": 0.7421875, "reward_std": 0.17806214094161987, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2224 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.392831541218638, "grad_norm": 0.046949961897370086, "learning_rate": 1.97379359868152e-07, "loss": 0.0, "num_tokens": 181589918.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2225 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504949684853452e-09, "advantages/std": 0.5726854801177979, "advantages/var": 0.32796865913775264, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.395698924731183, "grad_norm": 0.1638038029950196, "learning_rate": 1.967086904437828e-07, "loss": 0.0, "num_tokens": 181666161.0, "reward": 0.765625, "reward_std": 0.15992169082164764, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2226 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379866977655094e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.398566308243727, "grad_norm": 0.08787095868807605, "learning_rate": 1.9603903806856105e-07, "loss": -0.0, "num_tokens": 181759538.0, "reward": 0.6640625, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 2227 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450174584180815e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.401433691756273, "grad_norm": 0.049987115694361976, "learning_rate": 1.9537040359033563e-07, "loss": 0.0, "num_tokens": 181840738.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2228 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.404301075268817, "grad_norm": 0.054362269499827566, "learning_rate": 1.947027878556665e-07, "loss": 0.0, "num_tokens": 181909564.0, "reward": 0.9296875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2229 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.407168458781362, "grad_norm": 0.07560767251882519, "learning_rate": 1.9403619170982355e-07, "loss": -0.0, "num_tokens": 181987820.0, "reward": 0.8828125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2230 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.410035842293907, "grad_norm": 0.16746578952250954, "learning_rate": 1.9337061599678538e-07, "loss": -0.0, "num_tokens": 182063290.0, "reward": 0.8203125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2231 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.505166341645741e-09, "advantages/std": 0.5726664066314697, "advantages/var": 0.32794681328419983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.412903225806452, "grad_norm": 0.17678767605376877, "learning_rate": 1.927060615592394e-07, "loss": 0.0, "num_tokens": 182146488.0, "reward": 0.8828125, "reward_std": 0.13941732048988342, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2232 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599614475511504e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.415770609318996, "grad_norm": 0.06865997664293769, "learning_rate": 1.9204252923858e-07, "loss": 0.0, "num_tokens": 182230991.0, "reward": 0.8125, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2233 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96693437436781e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.418637992831541, "grad_norm": 0.07838817614893728, "learning_rate": 1.913800198749067e-07, "loss": 0.0, "num_tokens": 182317276.0, "reward": 0.875, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2234 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966859224177393e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 6.421505376344086, "grad_norm": 0.11167399598179967, "learning_rate": 1.9071853430702412e-07, "loss": 0.0, "num_tokens": 182397868.0, "reward": 0.8359375, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2235 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983361318629381e-09, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.424372759856631, "grad_norm": 0.09026306625703183, "learning_rate": 1.9005807337244107e-07, "loss": -0.0, "num_tokens": 182482710.0, "reward": 0.78125, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2236 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983266117900325e-09, "advantages/std": 0.46761754155158997, "advantages/var": 0.21866616516675297, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.4272401433691755, "grad_norm": 0.11437771422820203, "learning_rate": 1.8939863790736922e-07, "loss": -0.0, "num_tokens": 182559246.0, "reward": 0.8671875, "reward_std": 0.130448117852211, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2237 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.43010752688172, "grad_norm": 0.09562689718485126, "learning_rate": 1.8874022874672057e-07, "loss": -0.0, "num_tokens": 182628246.0, "reward": 0.9765625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 2238 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998246708054356e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.4329749103942655, "grad_norm": 0.13211713898500313, "learning_rate": 1.88082846724109e-07, "loss": 0.0, "num_tokens": 182708667.0, "reward": 0.8203125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2239 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899641578136434e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.43584229390681, "grad_norm": 0.08708094307258604, "learning_rate": 1.8742649267184796e-07, "loss": 0.0, "num_tokens": 182791955.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2240 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.438709677419355, "grad_norm": 0.15321504604046224, "learning_rate": 1.8677116742094856e-07, "loss": 0.0, "num_tokens": 182864080.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2241 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 1.1579395756169161e-08, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.4415770609319, "grad_norm": 0.1586511176955155, "learning_rate": 1.8611687180111956e-07, "loss": -0.0, "num_tokens": 182940100.0, "reward": 0.78125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2242 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.444444444444445, "grad_norm": 0.08017374574585492, "learning_rate": 1.8546360664076655e-07, "loss": 0.0, "num_tokens": 183019220.0, "reward": 0.8828125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2243 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721560650546284e-09, "advantages/std": 0.5227927565574646, "advantages/var": 0.27331226630895245, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.447311827956989, "grad_norm": 0.09724299830917557, "learning_rate": 1.8481137276699042e-07, "loss": 0.0, "num_tokens": 183105169.0, "reward": 0.7578125, "reward_std": 0.1412346363067627, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2244 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.450179211469534, "grad_norm": 0.09035132973576847, "learning_rate": 1.841601710055859e-07, "loss": 0.0, "num_tokens": 183174377.0, "reward": 0.828125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2245 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.563018557708836e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.453046594982079, "grad_norm": 0.09789317420625834, "learning_rate": 1.8351000218104084e-07, "loss": 0.0, "num_tokens": 183247680.0, "reward": 0.953125, "reward_std": 0.1173202320933342, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2246 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344599133202926e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.455913978494624, "grad_norm": 0.1637132592879638, "learning_rate": 1.8286086711653604e-07, "loss": 0.0, "num_tokens": 183327463.0, "reward": 0.7890625, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2247 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.458781362007168, "grad_norm": 0.14268225971430398, "learning_rate": 1.8221276663394314e-07, "loss": 0.0, "num_tokens": 183412394.0, "reward": 0.8203125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2248 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.461648745519713, "grad_norm": 0.0952562831504514, "learning_rate": 1.8156570155382355e-07, "loss": 0.0, "num_tokens": 183484205.0, "reward": 0.890625, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2249 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.464516129032258, "grad_norm": 0.05675163813786104, "learning_rate": 1.8091967269542774e-07, "loss": 0.0, "num_tokens": 183555605.0, "reward": 0.890625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2250 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.467383512544803, "grad_norm": 0.09555172292089295, "learning_rate": 1.8027468087669485e-07, "loss": 0.0, "num_tokens": 183615502.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2251 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.4702508960573475, "grad_norm": 0.09915418889669067, "learning_rate": 1.7963072691425085e-07, "loss": -0.0, "num_tokens": 183692283.0, "reward": 0.8984375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2252 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.991725342260504e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.473118279569892, "grad_norm": 0.12942772670108643, "learning_rate": 1.7898781162340682e-07, "loss": 0.0, "num_tokens": 183770014.0, "reward": 0.859375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2253 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.59950277215078e-09, "advantages/std": 0.4049666225910187, "advantages/var": 0.16399796541277656, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.4759856630824375, "grad_norm": 0.09790204833525104, "learning_rate": 1.7834593581816017e-07, "loss": 0.0, "num_tokens": 183853983.0, "reward": 0.796875, "reward_std": 0.09574718773365021, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2254 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33063647150993347, "advantages/var": 0.10932047629253905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.478853046594982, "grad_norm": 0.07205758936828763, "learning_rate": 1.7770510031119102e-07, "loss": 0.0, "num_tokens": 183932942.0, "reward": 0.8828125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2255 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.481720430107527, "grad_norm": 0.1132707791729546, "learning_rate": 1.770653059138626e-07, "loss": 0.0, "num_tokens": 184006219.0, "reward": 0.75, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2256 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.484587813620072, "grad_norm": 0.06045306560334811, "learning_rate": 1.7642655343622047e-07, "loss": 0.0, "num_tokens": 184090803.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2257 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.487455197132617, "grad_norm": 0.12755924552355358, "learning_rate": 1.757888436869911e-07, "loss": 0.0, "num_tokens": 184179938.0, "reward": 0.9453125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2258 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.490322580645161, "grad_norm": 0.13122605358577782, "learning_rate": 1.7515217747358013e-07, "loss": 0.0, "num_tokens": 184261909.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2259 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.493189964157706, "grad_norm": 0.10031688387770546, "learning_rate": 1.745165556020718e-07, "loss": 0.0, "num_tokens": 184324244.0, "reward": 0.96875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2260 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.496057347670251, "grad_norm": 0.07288569607456609, "learning_rate": 1.738819788772291e-07, "loss": -0.0, "num_tokens": 184396666.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2261 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299811976811062e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.498924731182796, "grad_norm": 0.1640293487561573, "learning_rate": 1.7324844810249128e-07, "loss": -0.0, "num_tokens": 184479153.0, "reward": 0.9296875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2262 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.50179211469534, "grad_norm": 0.03134484975609983, "learning_rate": 1.7261596407997303e-07, "loss": 0.0, "num_tokens": 184550997.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2263 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.504659498207886, "grad_norm": 0.12737389137270913, "learning_rate": 1.7198452761046378e-07, "loss": 0.0, "num_tokens": 184623639.0, "reward": 0.78125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2264 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5629543724332273e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.50752688172043, "grad_norm": 0.12360684737711228, "learning_rate": 1.7135413949342704e-07, "loss": 0.0, "num_tokens": 184700828.0, "reward": 0.875, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2265 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.510394265232975, "grad_norm": 0.08835364361365992, "learning_rate": 1.70724800526999e-07, "loss": 0.0, "num_tokens": 184778848.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2266 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.5132616487455195, "grad_norm": 0.030580109273297615, "learning_rate": 1.700965115079871e-07, "loss": 0.0, "num_tokens": 184849812.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2267 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.516129032258064, "grad_norm": 0.075470863165079, "learning_rate": 1.6946927323186942e-07, "loss": -0.0, "num_tokens": 184928483.0, "reward": 0.8359375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2268 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958849501312727e-10, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.5189964157706095, "grad_norm": 0.12383606307308048, "learning_rate": 1.688430864927941e-07, "loss": 0.0, "num_tokens": 185009809.0, "reward": 0.875, "reward_std": 0.09863808006048203, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2269 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33065706491470337, "advantages/var": 0.10933409457800636, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.521863799283154, "grad_norm": 0.08853756988079814, "learning_rate": 1.6821795208357824e-07, "loss": 0.0, "num_tokens": 185080483.0, "reward": 0.9375, "reward_std": 0.06681530922651291, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2270 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.524731182795699, "grad_norm": 0.06285731070616504, "learning_rate": 1.675938707957053e-07, "loss": -0.0, "num_tokens": 185160894.0, "reward": 0.828125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2271 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.527598566308244, "grad_norm": 0.1703460319537216, "learning_rate": 1.6697084341932631e-07, "loss": 0.0, "num_tokens": 185237373.0, "reward": 0.8359375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2272 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958149428367127e-09, "advantages/std": 0.46761828660964966, "advantages/var": 0.21866686197174445, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.530465949820789, "grad_norm": 0.10918731325145618, "learning_rate": 1.6634887074325842e-07, "loss": 0.0, "num_tokens": 185319943.0, "reward": 0.765625, "reward_std": 0.1315089464187622, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2273 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5055955396409083e-09, "advantages/std": 0.6185742020606995, "advantages/var": 0.38263404345503105, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 6.533333333333333, "grad_norm": 0.13860061076008143, "learning_rate": 1.6572795355498226e-07, "loss": 0.0, "num_tokens": 185408472.0, "reward": 0.796875, "reward_std": 0.19438527524471283, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2274 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.536200716845878, "grad_norm": 0.07592588942538149, "learning_rate": 1.651080926406425e-07, "loss": 0.0, "num_tokens": 185482156.0, "reward": 0.796875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2275 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.539068100358423, "grad_norm": 0.12832812949240505, "learning_rate": 1.6448928878504686e-07, "loss": 0.0, "num_tokens": 185561234.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2276 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.541935483870968, "grad_norm": 0.0, "learning_rate": 1.638715427716648e-07, "loss": 0.0, "num_tokens": 185630813.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2277 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.544802867383512, "grad_norm": 0.06598763109472212, "learning_rate": 1.6325485538262563e-07, "loss": 0.0, "num_tokens": 185707527.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2278 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.547670250896058, "grad_norm": 0.08948401778154895, "learning_rate": 1.6263922739871882e-07, "loss": 0.0, "num_tokens": 185787243.0, "reward": 0.7890625, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2279 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.550537634408602, "grad_norm": 0.10329354184463344, "learning_rate": 1.620246595993925e-07, "loss": 0.0, "num_tokens": 185859425.0, "reward": 0.96875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2280 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.23513460695797e-09, "advantages/std": 0.5227840542793274, "advantages/var": 0.27330316740873073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.553405017921147, "grad_norm": 0.12655950039508182, "learning_rate": 1.6141115276275297e-07, "loss": -0.0, "num_tokens": 185938711.0, "reward": 0.78125, "reward_std": 0.13204574584960938, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2281 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.5562724014336915, "grad_norm": 0.09904348854237273, "learning_rate": 1.6079870766556236e-07, "loss": 0.0, "num_tokens": 186014383.0, "reward": 0.921875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2282 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.559139784946236, "grad_norm": 0.08147217328524484, "learning_rate": 1.6018732508323885e-07, "loss": 0.0, "num_tokens": 186091072.0, "reward": 0.8046875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2283 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.5620071684587815, "grad_norm": 0.1543357073643548, "learning_rate": 1.5957700578985557e-07, "loss": 0.0, "num_tokens": 186171198.0, "reward": 0.8203125, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2284 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983467187183905e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.564874551971326, "grad_norm": 0.09278990300735251, "learning_rate": 1.5896775055813973e-07, "loss": 0.0, "num_tokens": 186245214.0, "reward": 0.828125, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2285 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.567741935483871, "grad_norm": 0.05992901706913987, "learning_rate": 1.5835956015947038e-07, "loss": 0.0, "num_tokens": 186326430.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2286 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967257841033182e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.570609318996416, "grad_norm": 0.1278084908427023, "learning_rate": 1.5775243536387907e-07, "loss": 0.0, "num_tokens": 186403537.0, "reward": 0.84375, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2287 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2946288161655503e-08, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.573476702508961, "grad_norm": 0.14354059395101898, "learning_rate": 1.5714637694004819e-07, "loss": 0.0, "num_tokens": 186478551.0, "reward": 0.8828125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2288 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016540399721408e-09, "advantages/std": 0.5227880477905273, "advantages/var": 0.2733073429126307, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.576344086021505, "grad_norm": 0.12563023202974105, "learning_rate": 1.565413856553095e-07, "loss": 0.0, "num_tokens": 186555219.0, "reward": 0.8359375, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2289 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.907227504745508e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.57921146953405, "grad_norm": 0.1043286914567863, "learning_rate": 1.559374622756441e-07, "loss": -0.0, "num_tokens": 186637660.0, "reward": 0.828125, "reward_std": 0.13781969249248505, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2290 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.582078853046595, "grad_norm": 0.11406391941071242, "learning_rate": 1.5533460756568128e-07, "loss": -0.0, "num_tokens": 186707110.0, "reward": 0.8828125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2291 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016434378286722e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.58494623655914, "grad_norm": 0.12958651018177703, "learning_rate": 1.5473282228869665e-07, "loss": 0.0, "num_tokens": 186785620.0, "reward": 0.84375, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2292 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.587813620071684, "grad_norm": 0.04880423815887028, "learning_rate": 1.5413210720661186e-07, "loss": 0.0, "num_tokens": 186857710.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2293 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.59068100358423, "grad_norm": 0.08622057145780773, "learning_rate": 1.535324630799939e-07, "loss": 0.0, "num_tokens": 186945087.0, "reward": 0.7890625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2294 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.593548387096774, "grad_norm": 0.06513710758080316, "learning_rate": 1.5293389066805397e-07, "loss": -0.0, "num_tokens": 187019967.0, "reward": 0.953125, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2295 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958906628562059e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.596415770609319, "grad_norm": 0.12757381249902855, "learning_rate": 1.523363907286459e-07, "loss": 0.0, "num_tokens": 187097472.0, "reward": 0.8984375, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2296 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.5992831541218635, "grad_norm": 0.10909393350858522, "learning_rate": 1.517399640182656e-07, "loss": 0.0, "num_tokens": 187166997.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2297 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.602150537634409, "grad_norm": 0.055835861752719866, "learning_rate": 1.511446112920508e-07, "loss": -0.0, "num_tokens": 187247103.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2298 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 6.6050179211469535, "grad_norm": 0.10637210493636282, "learning_rate": 1.5055033330377907e-07, "loss": -0.0, "num_tokens": 187327258.0, "reward": 0.7109375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 2299 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.607885304659498, "grad_norm": 0.04125068953551882, "learning_rate": 1.4995713080586735e-07, "loss": -0.0, "num_tokens": 187406269.0, "reward": 0.9375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2300 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.610752688172043, "grad_norm": 0.11344854653970772, "learning_rate": 1.493650045493703e-07, "loss": 0.0, "num_tokens": 187483942.0, "reward": 0.9609375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2301 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.613620071684588, "grad_norm": 0.09709970395074634, "learning_rate": 1.4877395528398085e-07, "loss": 0.0, "num_tokens": 187568404.0, "reward": 0.859375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2302 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.616487455197133, "grad_norm": 0.10086578741031085, "learning_rate": 1.4818398375802833e-07, "loss": 0.0, "num_tokens": 187652589.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2303 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149940263022739e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 6.619354838709677, "grad_norm": 0.06947631908435752, "learning_rate": 1.4759509071847632e-07, "loss": -0.0, "num_tokens": 187740747.0, "reward": 0.6953125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 2304 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958683201273463e-10, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.622222222222222, "grad_norm": 0.09733944745106135, "learning_rate": 1.4700727691092418e-07, "loss": -0.0, "num_tokens": 187822220.0, "reward": 0.7578125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2305 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.625089605734767, "grad_norm": 0.08233489562826543, "learning_rate": 1.464205430796047e-07, "loss": 0.0, "num_tokens": 187907879.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2306 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 6.627956989247312, "grad_norm": 0.06916655964207967, "learning_rate": 1.458348899673829e-07, "loss": 0.0, "num_tokens": 187997297.0, "reward": 0.796875, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2307 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.630824372759856, "grad_norm": 0.08410716414331963, "learning_rate": 1.452503183157554e-07, "loss": 0.0, "num_tokens": 188082060.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2308 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.633691756272402, "grad_norm": 0.04522199115014992, "learning_rate": 1.4466682886485004e-07, "loss": 0.0, "num_tokens": 188154859.0, "reward": 0.90625, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2309 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971078240891425e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.636559139784946, "grad_norm": 0.10442753300294257, "learning_rate": 1.4408442235342455e-07, "loss": 0.0, "num_tokens": 188238796.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2310 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.639426523297491, "grad_norm": 0.055922668216063266, "learning_rate": 1.4350309951886485e-07, "loss": 0.0, "num_tokens": 188319030.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2311 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.6422939068100355, "grad_norm": 0.08318317416468163, "learning_rate": 1.4292286109718532e-07, "loss": 0.0, "num_tokens": 188397441.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2312 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.645161290322581, "grad_norm": 0.05860914220795526, "learning_rate": 1.4234370782302741e-07, "loss": -0.0, "num_tokens": 188473863.0, "reward": 0.9453125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2313 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950220288145723e-08, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.6480286738351255, "grad_norm": 0.1288524821089973, "learning_rate": 1.4176564042965867e-07, "loss": 0.0, "num_tokens": 188563518.0, "reward": 0.8125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2314 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.65089605734767, "grad_norm": 0.06351548222281105, "learning_rate": 1.411886596489714e-07, "loss": -0.0, "num_tokens": 188638305.0, "reward": 0.9296875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2315 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.394200364231044e-08, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.6537634408602155, "grad_norm": 0.11442339823270581, "learning_rate": 1.4061276621148244e-07, "loss": 0.0, "num_tokens": 188710787.0, "reward": 0.9453125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2316 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494468808174133, "advantages/var": 0.16398020040561878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.65663082437276, "grad_norm": 0.13153217173799472, "learning_rate": 1.4003796084633201e-07, "loss": 0.0, "num_tokens": 188792398.0, "reward": 0.890625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2317 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.125942055767658e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 6.659498207885305, "grad_norm": 0.09656465916026838, "learning_rate": 1.3946424428128278e-07, "loss": -0.0, "num_tokens": 188891865.0, "reward": 0.6328125, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 2318 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.662365591397849, "grad_norm": 0.08164599281834158, "learning_rate": 1.388916172427187e-07, "loss": -0.0, "num_tokens": 188968480.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2319 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899474012416308e-09, "advantages/std": 0.4049537181854248, "advantages/var": 0.16398751387220045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.665232974910394, "grad_norm": 0.1378308728977492, "learning_rate": 1.383200804556438e-07, "loss": 0.0, "num_tokens": 189049978.0, "reward": 0.8203125, "reward_std": 0.08443661779165268, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2320 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.668100358422939, "grad_norm": 0.08325008115834502, "learning_rate": 1.3774963464368294e-07, "loss": -0.0, "num_tokens": 189136583.0, "reward": 0.75, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2321 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.670967741935484, "grad_norm": 0.10295338191905493, "learning_rate": 1.3718028052907848e-07, "loss": 0.0, "num_tokens": 189214728.0, "reward": 0.953125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2322 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.673835125448028, "grad_norm": 0.12319470024336432, "learning_rate": 1.3661201883269159e-07, "loss": -0.0, "num_tokens": 189282756.0, "reward": 0.8359375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2323 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299880526045478e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.676702508960574, "grad_norm": 0.08015989045867306, "learning_rate": 1.3604485027399926e-07, "loss": 0.0, "num_tokens": 189365339.0, "reward": 0.9140625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2324 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899540529955257e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.679569892473118, "grad_norm": 0.08966959448335038, "learning_rate": 1.3547877557109544e-07, "loss": 0.0, "num_tokens": 189443727.0, "reward": 0.7890625, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2325 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.682437275985663, "grad_norm": 0.07473854451423957, "learning_rate": 1.349137954406885e-07, "loss": 0.0, "num_tokens": 189527280.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2326 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.6853046594982075, "grad_norm": 0.0840217923546371, "learning_rate": 1.3434991059810153e-07, "loss": 0.0, "num_tokens": 189604224.0, "reward": 0.9609375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2327 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.688172043010753, "grad_norm": 0.07933183719617533, "learning_rate": 1.3378712175727013e-07, "loss": 0.0, "num_tokens": 189683559.0, "reward": 0.8828125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2328 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966710959040678e-09, "advantages/std": 0.4676070511341095, "advantages/var": 0.2186563542703377, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.6910394265232975, "grad_norm": 0.13014079856538852, "learning_rate": 1.3322542963074314e-07, "loss": 0.0, "num_tokens": 189762737.0, "reward": 0.9140625, "reward_std": 0.12019838392734528, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2329 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016648251989223e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.693906810035842, "grad_norm": 0.11759822323468655, "learning_rate": 1.3266483492967984e-07, "loss": 0.0, "num_tokens": 189839944.0, "reward": 0.8125, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2330 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.6967741935483875, "grad_norm": 0.061950973774611895, "learning_rate": 1.3210533836385085e-07, "loss": 0.0, "num_tokens": 189928901.0, "reward": 0.8046875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2331 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975176026781512e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.699641577060932, "grad_norm": 0.1491010784574104, "learning_rate": 1.315469406416363e-07, "loss": 0.0, "num_tokens": 190001614.0, "reward": 0.84375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2332 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.702508960573477, "grad_norm": 0.07483524795785089, "learning_rate": 1.3098964247002497e-07, "loss": -0.0, "num_tokens": 190089738.0, "reward": 0.765625, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2333 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.705376344086021, "grad_norm": 0.07908883625271022, "learning_rate": 1.3043344455461315e-07, "loss": -0.0, "num_tokens": 190159776.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2334 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.1312150336472e-10, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 6.708243727598567, "grad_norm": 0.15440760245013682, "learning_rate": 1.298783475996046e-07, "loss": 0.0, "num_tokens": 190242855.0, "reward": 0.7421875, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2335 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.878747807970186e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.711111111111111, "grad_norm": 0.12867147625864356, "learning_rate": 1.2932435230780937e-07, "loss": 0.0, "num_tokens": 190325072.0, "reward": 0.75, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2336 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.713978494623656, "grad_norm": 0.12791816007869064, "learning_rate": 1.287714593806415e-07, "loss": -0.0, "num_tokens": 190407043.0, "reward": 0.9296875, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2337 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2525469477123842e-09, "advantages/std": 0.5726727843284607, "advantages/var": 0.32795411791051166, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.7168458781362, "grad_norm": 0.14296567310307667, "learning_rate": 1.2821966951812045e-07, "loss": 0.0, "num_tokens": 190494543.0, "reward": 0.734375, "reward_std": 0.1462521106004715, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2338 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.719713261648746, "grad_norm": 0.05962957855453991, "learning_rate": 1.2766898341886912e-07, "loss": 0.0, "num_tokens": 190572934.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2339 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9752099207640785e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.72258064516129, "grad_norm": 0.10095528134966746, "learning_rate": 1.2711940178011228e-07, "loss": 0.0, "num_tokens": 190648783.0, "reward": 0.9453125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2340 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.725448028673835, "grad_norm": 0.10052310936070973, "learning_rate": 1.2657092529767644e-07, "loss": 0.0, "num_tokens": 190719893.0, "reward": 0.9765625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 2341 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96694656101877e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.7283154121863795, "grad_norm": 0.0944250765680734, "learning_rate": 1.2602355466598912e-07, "loss": 0.0, "num_tokens": 190801481.0, "reward": 0.8359375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2342 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.731182795698925, "grad_norm": 0.18167790449043975, "learning_rate": 1.254772905780781e-07, "loss": -0.0, "num_tokens": 190874609.0, "reward": 0.875, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2343 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.7340501792114695, "grad_norm": 0.0, "learning_rate": 1.2493213372556933e-07, "loss": 0.0, "num_tokens": 190950231.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2344 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.736917562724014, "grad_norm": 0.09988909391940451, "learning_rate": 1.2438808479868711e-07, "loss": -0.0, "num_tokens": 191025493.0, "reward": 0.8203125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2345 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.7397849462365595, "grad_norm": 0.10481401419743164, "learning_rate": 1.2384514448625337e-07, "loss": 0.0, "num_tokens": 191098496.0, "reward": 0.84375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2346 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.742652329749104, "grad_norm": 0.0, "learning_rate": 1.2330331347568634e-07, "loss": 0.0, "num_tokens": 191159168.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2347 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.745519713261649, "grad_norm": 0.0, "learning_rate": 1.2276259245299957e-07, "loss": 0.0, "num_tokens": 191224867.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2348 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633513702797483e-09, "advantages/std": 0.33063647150993347, "advantages/var": 0.10932047629253905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.748387096774193, "grad_norm": 0.07635001452023805, "learning_rate": 1.22222982102801e-07, "loss": 0.0, "num_tokens": 191300309.0, "reward": 0.7578125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2349 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.672251731040016e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.751254480286739, "grad_norm": 0.11701705073979274, "learning_rate": 1.2168448310829292e-07, "loss": -0.0, "num_tokens": 191385040.0, "reward": 0.7109375, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 2350 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.754121863799283, "grad_norm": 0.06459934177246711, "learning_rate": 1.211470961512705e-07, "loss": 0.0, "num_tokens": 191474607.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2351 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.756989247311828, "grad_norm": 0.052180544440312226, "learning_rate": 1.2061082191212034e-07, "loss": 0.0, "num_tokens": 191551333.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2352 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.379887186086637e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.759856630824372, "grad_norm": 0.06945171785207692, "learning_rate": 1.2007566106982049e-07, "loss": 0.0, "num_tokens": 191635601.0, "reward": 0.8828125, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2353 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.762724014336918, "grad_norm": 0.10372258196470002, "learning_rate": 1.1954161430193988e-07, "loss": -0.0, "num_tokens": 191703515.0, "reward": 0.78125, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2354 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.765591397849462, "grad_norm": 0.08257684873620948, "learning_rate": 1.1900868228463601e-07, "loss": 0.0, "num_tokens": 191780782.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2355 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.768458781362007, "grad_norm": 0.045251743367015144, "learning_rate": 1.1847686569265591e-07, "loss": -0.0, "num_tokens": 191856502.0, "reward": 0.8203125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2356 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.7713261648745515, "grad_norm": 0.0, "learning_rate": 1.1794616519933342e-07, "loss": 0.0, "num_tokens": 191935191.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2357 }, { "advantages/mean": -9.778887033462524e-09, "advantages/snr": 1.707553557573222e-08, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.774193548387097, "grad_norm": 0.11642091771443897, "learning_rate": 1.1741658147659029e-07, "loss": -0.0, "num_tokens": 192017176.0, "reward": 0.796875, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2358 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.7770609318996415, "grad_norm": 0.05671825721944357, "learning_rate": 1.1688811519493325e-07, "loss": 0.0, "num_tokens": 192094521.0, "reward": 0.953125, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2359 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.15796030306511e-08, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.779928315412186, "grad_norm": 0.12421389076108073, "learning_rate": 1.1636076702345532e-07, "loss": -0.0, "num_tokens": 192180594.0, "reward": 0.828125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2360 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.7827956989247316, "grad_norm": 0.05446125050765403, "learning_rate": 1.1583453762983286e-07, "loss": 0.0, "num_tokens": 192264455.0, "reward": 0.7421875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2361 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.785663082437276, "grad_norm": 0.06150132165919982, "learning_rate": 1.1530942768032681e-07, "loss": 0.0, "num_tokens": 192340392.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2362 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.8788021410185465e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.788530465949821, "grad_norm": 0.17767367881423626, "learning_rate": 1.1478543783977945e-07, "loss": -0.0, "num_tokens": 192414442.0, "reward": 0.8046875, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2363 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907256955369e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.791397849462365, "grad_norm": 0.10554860283841914, "learning_rate": 1.1426256877161645e-07, "loss": 0.0, "num_tokens": 192493471.0, "reward": 0.921875, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2364 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.794265232974911, "grad_norm": 0.09473947993866931, "learning_rate": 1.1374082113784288e-07, "loss": -0.0, "num_tokens": 192575267.0, "reward": 0.7265625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2365 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983628920516591e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 6.797132616487455, "grad_norm": 0.10077049301848726, "learning_rate": 1.1322019559904539e-07, "loss": 0.0, "num_tokens": 192669884.0, "reward": 0.796875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2366 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950002462672147e-08, "advantages/std": 0.46760955452919006, "advantages/var": 0.21865869548698758, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.8, "grad_norm": 0.13009809400316338, "learning_rate": 1.1270069281438866e-07, "loss": 0.0, "num_tokens": 192758973.0, "reward": 0.734375, "reward_std": 0.12020328640937805, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2367 }, { "advantages/mean": 7.450580596923828e-09, "advantages/snr": 1.3010090294948019e-08, "advantages/std": 0.5726770758628845, "advantages/var": 0.327959033218864, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.802867383512545, "grad_norm": 0.28086267927181563, "learning_rate": 1.1218231344161688e-07, "loss": 0.0, "num_tokens": 192839763.0, "reward": 0.828125, "reward_std": 0.1530819833278656, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2368 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.225178756939558e-09, "advantages/std": 0.6612661480903625, "advantages/var": 0.4372729186102653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.80573476702509, "grad_norm": 0.15400359884936654, "learning_rate": 1.1166505813705185e-07, "loss": 0.0, "num_tokens": 192924907.0, "reward": 0.7265625, "reward_std": 0.19833700358867645, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2369 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917813257124117e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.808602150537634, "grad_norm": 0.09790313334704302, "learning_rate": 1.111489275555909e-07, "loss": 0.0, "num_tokens": 193009806.0, "reward": 0.8203125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2370 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.811469534050179, "grad_norm": 0.05408816665345678, "learning_rate": 1.1063392235070878e-07, "loss": 0.0, "num_tokens": 193080934.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2371 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.8143369175627235, "grad_norm": 0.03213567979404306, "learning_rate": 1.10120043174455e-07, "loss": 0.0, "num_tokens": 193148839.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2372 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.817204301075269, "grad_norm": 0.08309195880315318, "learning_rate": 1.09607290677453e-07, "loss": -0.0, "num_tokens": 193225220.0, "reward": 0.953125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2373 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917221686896894e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.8200716845878135, "grad_norm": 0.15628542485796007, "learning_rate": 1.0909566550890003e-07, "loss": 0.0, "num_tokens": 193297917.0, "reward": 0.8828125, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2374 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9752007807758586e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.822939068100358, "grad_norm": 0.09904807418569123, "learning_rate": 1.0858516831656594e-07, "loss": 0.0, "num_tokens": 193383409.0, "reward": 0.875, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2375 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.505062750816392e-09, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.825806451612904, "grad_norm": 0.20379846818685546, "learning_rate": 1.0807579974679293e-07, "loss": 0.0, "num_tokens": 193462091.0, "reward": 0.921875, "reward_std": 0.15072786808013916, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2376 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.8787427301770344e-09, "advantages/std": 0.5726819038391113, "advantages/var": 0.32796456298478915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.828673835125448, "grad_norm": 0.1290161235042563, "learning_rate": 1.0756756044449356e-07, "loss": 0.0, "num_tokens": 193551892.0, "reward": 0.8203125, "reward_std": 0.15756267309188843, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2377 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.831541218637993, "grad_norm": 0.11597649933266897, "learning_rate": 1.0706045105315064e-07, "loss": 0.0, "num_tokens": 193630545.0, "reward": 0.8359375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2378 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.834408602150537, "grad_norm": 0.07912867494499928, "learning_rate": 1.0655447221481684e-07, "loss": 0.0, "num_tokens": 193699224.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2379 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907227504745508e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.837275985663083, "grad_norm": 0.13735672066171117, "learning_rate": 1.0604962457011346e-07, "loss": 0.0, "num_tokens": 193780696.0, "reward": 0.875, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2380 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.840143369175627, "grad_norm": 0.06164208877522874, "learning_rate": 1.0554590875822921e-07, "loss": 0.0, "num_tokens": 193865866.0, "reward": 0.890625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2381 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967257841033182e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.843010752688172, "grad_norm": 0.11723371039089976, "learning_rate": 1.0504332541691984e-07, "loss": 0.0, "num_tokens": 193933726.0, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2382 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.845878136200717, "grad_norm": 0.13170995957867837, "learning_rate": 1.0454187518250734e-07, "loss": 0.0, "num_tokens": 194013973.0, "reward": 0.9140625, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2383 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.848745519713262, "grad_norm": 0.09340825441817463, "learning_rate": 1.040415586898794e-07, "loss": -0.0, "num_tokens": 194087202.0, "reward": 0.9296875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2384 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.851612903225806, "grad_norm": 0.10969877961204985, "learning_rate": 1.0354237657248788e-07, "loss": 0.0, "num_tokens": 194168985.0, "reward": 0.8046875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2385 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971194650918909e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.854480286738351, "grad_norm": 0.12611193260627374, "learning_rate": 1.0304432946234831e-07, "loss": -0.0, "num_tokens": 194247966.0, "reward": 0.84375, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2386 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.857347670250896, "grad_norm": 0.06003868787153907, "learning_rate": 1.0254741799003975e-07, "loss": 0.0, "num_tokens": 194322091.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2387 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299807237755752e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.860215053763441, "grad_norm": 0.07350127118769947, "learning_rate": 1.0205164278470258e-07, "loss": -0.0, "num_tokens": 194400704.0, "reward": 0.859375, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2388 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.8630824372759855, "grad_norm": 0.07557750424369361, "learning_rate": 1.0155700447403958e-07, "loss": 0.0, "num_tokens": 194478730.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2389 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.86594982078853, "grad_norm": 0.08613156187088937, "learning_rate": 1.0106350368431304e-07, "loss": -0.0, "num_tokens": 194566159.0, "reward": 0.734375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2390 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.868817204301076, "grad_norm": 0.04729672915690954, "learning_rate": 1.0057114104034604e-07, "loss": 0.0, "num_tokens": 194641042.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2391 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.87168458781362, "grad_norm": 0.03435381276349522, "learning_rate": 1.0007991716551967e-07, "loss": 0.0, "num_tokens": 194717110.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2392 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983629174425397e-09, "advantages/std": 0.46757492423057556, "advantages/var": 0.21862630976922848, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.874551971326165, "grad_norm": 0.1423937645833672, "learning_rate": 9.958983268177423e-08, "loss": 0.0, "num_tokens": 194799789.0, "reward": 0.796875, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2393 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 6.877419354838709, "grad_norm": 0.05553013396548312, "learning_rate": 9.91008882096065e-08, "loss": 0.0, "num_tokens": 194881420.0, "reward": 0.7734375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2394 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.880286738351255, "grad_norm": 0.0711328390587382, "learning_rate": 9.861308436807058e-08, "loss": 0.0, "num_tokens": 194966089.0, "reward": 0.6640625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 2395 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.883154121863799, "grad_norm": 0.07308119092273002, "learning_rate": 9.812642177477582e-08, "loss": -0.0, "num_tokens": 195042405.0, "reward": 0.8984375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2396 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.0224013087023915e-09, "advantages/std": 0.6185722351074219, "advantages/var": 0.3826316100457916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.886021505376344, "grad_norm": 0.14216845633733094, "learning_rate": 9.76409010458874e-08, "loss": 0.0, "num_tokens": 195124172.0, "reward": 0.8203125, "reward_std": 0.190970316529274, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2397 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998806953079044e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.888888888888889, "grad_norm": 0.13111727040757493, "learning_rate": 9.715652279612385e-08, "loss": 0.0, "num_tokens": 195194271.0, "reward": 0.8515625, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2398 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.891756272401434, "grad_norm": 0.06978758286791242, "learning_rate": 9.667328763875815e-08, "loss": 0.0, "num_tokens": 195276428.0, "reward": 0.7890625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2399 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.894623655913978, "grad_norm": 0.01962447601162728, "learning_rate": 9.619119618561511e-08, "loss": 0.0, "num_tokens": 195349877.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2400 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.897491039426523, "grad_norm": 0.09527564749609127, "learning_rate": 9.571024904707237e-08, "loss": 0.0, "num_tokens": 195426482.0, "reward": 0.828125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2401 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983334154224049e-09, "advantages/std": 0.46760955452919006, "advantages/var": 0.21865869548698758, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.900358422939068, "grad_norm": 0.0949697787491498, "learning_rate": 9.523044683205816e-08, "loss": -0.0, "num_tokens": 195508115.0, "reward": 0.890625, "reward_std": 0.12020329385995865, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2402 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.903225806451613, "grad_norm": 0.07750667496085199, "learning_rate": 9.47517901480509e-08, "loss": 0.0, "num_tokens": 195581496.0, "reward": 0.8984375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2403 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.9060931899641576, "grad_norm": 0.03105968279643574, "learning_rate": 9.427427960107948e-08, "loss": 0.0, "num_tokens": 195657265.0, "reward": 0.84375, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2404 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.908960573476703, "grad_norm": 0.12653924761950092, "learning_rate": 9.379791579572116e-08, "loss": 0.0, "num_tokens": 195744986.0, "reward": 0.9453125, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2405 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.911827956989248, "grad_norm": 0.0, "learning_rate": 9.332269933510118e-08, "loss": 0.0, "num_tokens": 195810789.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2406 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.7495735248314805e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.914695340501792, "grad_norm": 0.10262501963713686, "learning_rate": 9.284863082089222e-08, "loss": 0.0, "num_tokens": 195888792.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2407 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.96238646402201e-09, "advantages/std": 0.4676155745983124, "advantages/var": 0.21866432560690985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.917562724014337, "grad_norm": 0.09030280936181079, "learning_rate": 9.237571085331375e-08, "loss": -0.0, "num_tokens": 195978064.0, "reward": 0.8359375, "reward_std": 0.12809401750564575, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2408 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.920430107526881, "grad_norm": 0.126383410799034, "learning_rate": 9.190394003113122e-08, "loss": 0.0, "num_tokens": 196061832.0, "reward": 0.84375, "reward_std": 0.1462521106004715, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2409 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516766163250374e-09, "advantages/std": 0.6185770034790039, "advantages/var": 0.3826375092330636, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 6.923297491039427, "grad_norm": 0.16998489866338543, "learning_rate": 9.143331895165451e-08, "loss": 0.0, "num_tokens": 196146156.0, "reward": 0.828125, "reward_std": 0.19568344950675964, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2410 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.926164874551971, "grad_norm": 0.07979723427059401, "learning_rate": 9.0963848210738e-08, "loss": -0.0, "num_tokens": 196227546.0, "reward": 0.8125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2411 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.929032258064516, "grad_norm": 0.01980690782337147, "learning_rate": 9.049552840278008e-08, "loss": 0.0, "num_tokens": 196306204.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2412 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344599133202926e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.931899641577061, "grad_norm": 0.16532044244601588, "learning_rate": 9.002836012072168e-08, "loss": 0.0, "num_tokens": 196391558.0, "reward": 0.8984375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2413 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998470125758874e-09, "advantages/std": 0.40494978427886963, "advantages/var": 0.16398432778750305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.934767025089606, "grad_norm": 0.1256327719558024, "learning_rate": 8.956234395604556e-08, "loss": 0.0, "num_tokens": 196468912.0, "reward": 0.9609375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2414 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.93763440860215, "grad_norm": 0.09539153420367744, "learning_rate": 8.909748049877586e-08, "loss": -0.0, "num_tokens": 196554755.0, "reward": 0.7265625, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2415 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.940501792114695, "grad_norm": 0.06816916822524462, "learning_rate": 8.863377033747754e-08, "loss": 0.0, "num_tokens": 196619149.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2416 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.94336917562724, "grad_norm": 0.11626064535405715, "learning_rate": 8.817121405925543e-08, "loss": 0.0, "num_tokens": 196697082.0, "reward": 0.875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2417 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899421713267256e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.946236559139785, "grad_norm": 0.1246901997257576, "learning_rate": 8.770981224975283e-08, "loss": 0.0, "num_tokens": 196762401.0, "reward": 0.875, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2418 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33063647150993347, "advantages/var": 0.10932047629253905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.94910394265233, "grad_norm": 0.06247756457977004, "learning_rate": 8.724956549315177e-08, "loss": -0.0, "num_tokens": 196837633.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2419 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.951971326164875, "grad_norm": 0.041928667114635366, "learning_rate": 8.679047437217202e-08, "loss": 0.0, "num_tokens": 196905063.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2420 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504861672153852e-09, "advantages/std": 0.5726932287216187, "advantages/var": 0.3279775342235922, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 6.95483870967742, "grad_norm": 0.1198628919772755, "learning_rate": 8.633253946806974e-08, "loss": 0.0, "num_tokens": 196990922.0, "reward": 0.7890625, "reward_std": 0.172288179397583, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2421 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998806953079044e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.957706093189964, "grad_norm": 0.06730846439692623, "learning_rate": 8.587576136063767e-08, "loss": -0.0, "num_tokens": 197078931.0, "reward": 0.7734375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2422 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.126037115417672e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.960573476702509, "grad_norm": 0.11127546833146218, "learning_rate": 8.542014062820369e-08, "loss": 0.0, "num_tokens": 197170175.0, "reward": 0.84375, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2423 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.963440860215054, "grad_norm": 0.08439475203157298, "learning_rate": 8.496567784763032e-08, "loss": -0.0, "num_tokens": 197235679.0, "reward": 0.9296875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2424 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.966308243727599, "grad_norm": 0.08537979412922808, "learning_rate": 8.451237359431396e-08, "loss": 0.0, "num_tokens": 197302221.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2425 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 6.969175627240143, "grad_norm": 0.06154497728748271, "learning_rate": 8.406022844218452e-08, "loss": 0.0, "num_tokens": 197385198.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2426 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 6.972043010752688, "grad_norm": 0.06944545136096203, "learning_rate": 8.360924296370375e-08, "loss": 0.0, "num_tokens": 197461982.0, "reward": 0.796875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2427 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 6.974910394265233, "grad_norm": 0.07255534103341262, "learning_rate": 8.31594177298659e-08, "loss": 0.0, "num_tokens": 197548845.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2428 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.7495735248314805e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.977777777777778, "grad_norm": 0.10179773967799725, "learning_rate": 8.271075331019539e-08, "loss": 0.0, "num_tokens": 197630688.0, "reward": 0.859375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2429 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0689055673126508e-08, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.980645161290322, "grad_norm": 0.18752062690561613, "learning_rate": 8.226325027274783e-08, "loss": 0.0, "num_tokens": 197709725.0, "reward": 0.953125, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2430 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814855139419146e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 6.983512544802867, "grad_norm": 0.1547020490210697, "learning_rate": 8.181690918410755e-08, "loss": -0.0, "num_tokens": 197789203.0, "reward": 0.8515625, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2431 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 6.986379928315412, "grad_norm": 0.10691789449904196, "learning_rate": 8.13717306093884e-08, "loss": 0.0, "num_tokens": 197878737.0, "reward": 0.71875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 2432 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 1.0349153895649778e-08, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 6.989247311827957, "grad_norm": 0.14593278899579626, "learning_rate": 8.092771511223185e-08, "loss": -0.0, "num_tokens": 197957783.0, "reward": 0.7265625, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2433 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 6.992114695340502, "grad_norm": 0.09489214535621193, "learning_rate": 8.04848632548073e-08, "loss": -0.0, "num_tokens": 198036099.0, "reward": 0.75, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2434 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 6.994982078853047, "grad_norm": 0.05444184447137259, "learning_rate": 8.004317559781048e-08, "loss": -0.0, "num_tokens": 198126276.0, "reward": 0.75, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2435 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 6.997849462365592, "grad_norm": 0.10842610808272253, "learning_rate": 7.960265270046306e-08, "loss": 0.0, "num_tokens": 198188792.0, "reward": 0.96875, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2436 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.0539042852721367e-08, "advantages/std": 0.6185815930366516, "advantages/var": 0.3826431872437617, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 7.002867383512545, "grad_norm": 0.15088482111626852, "learning_rate": 7.916329512051234e-08, "loss": 0.0, "num_tokens": 198274154.0, "reward": 0.6796875, "reward_std": 0.20357416570186615, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 2437 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 7.00573476702509, "grad_norm": 0.11637849863941437, "learning_rate": 7.872510341423021e-08, "loss": 0.0, "num_tokens": 198359529.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2438 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.008602150537635, "grad_norm": 0.0, "learning_rate": 7.828807813641225e-08, "loss": 0.0, "num_tokens": 198436506.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2439 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562997839424082e-09, "advantages/std": 0.5227746963500977, "advantages/var": 0.2732933831439368, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.011469534050179, "grad_norm": 0.16305711932052788, "learning_rate": 7.785221984037694e-08, "loss": 0.0, "num_tokens": 198507930.0, "reward": 0.875, "reward_std": 0.12179599702358246, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2440 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814455009491016e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.014336917562724, "grad_norm": 0.11316457630747334, "learning_rate": 7.741752907796583e-08, "loss": 0.0, "num_tokens": 198585558.0, "reward": 0.828125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2441 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.017204301075269, "grad_norm": 0.05597624489885627, "learning_rate": 7.698400639954216e-08, "loss": -0.0, "num_tokens": 198664801.0, "reward": 0.8046875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2442 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.020071684587814, "grad_norm": 0.07938407607405072, "learning_rate": 7.655165235398986e-08, "loss": -0.0, "num_tokens": 198737953.0, "reward": 0.7890625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2443 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628597236829876e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.022939068100358, "grad_norm": 0.16381168187844106, "learning_rate": 7.612046748871326e-08, "loss": -0.0, "num_tokens": 198822217.0, "reward": 0.84375, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2444 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.025806451612903, "grad_norm": 0.08935558920486494, "learning_rate": 7.56904523496369e-08, "loss": 0.0, "num_tokens": 198897359.0, "reward": 0.7890625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2445 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.028673835125448, "grad_norm": 0.091064230207462, "learning_rate": 7.526160748120414e-08, "loss": -0.0, "num_tokens": 198971639.0, "reward": 0.9140625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2446 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 1.1266523706756892e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.031541218637993, "grad_norm": 0.08820505386920832, "learning_rate": 7.483393342637634e-08, "loss": -0.0, "num_tokens": 199044117.0, "reward": 0.8203125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2447 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.034408602150537, "grad_norm": 0.0, "learning_rate": 7.440743072663258e-08, "loss": 0.0, "num_tokens": 199114910.0, "reward": 0.75, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2448 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.344329800322181e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.037275985663083, "grad_norm": 0.1262476234721405, "learning_rate": 7.398209992196913e-08, "loss": -0.0, "num_tokens": 199205873.0, "reward": 0.7421875, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2449 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262492693233955e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.040143369175627, "grad_norm": 0.14993712851375646, "learning_rate": 7.355794155089856e-08, "loss": 0.0, "num_tokens": 199296915.0, "reward": 0.859375, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2450 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.301018914496957e-08, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.043010752688172, "grad_norm": 0.1454220498452137, "learning_rate": 7.313495615044873e-08, "loss": 0.0, "num_tokens": 199378036.0, "reward": 0.9375, "reward_std": 0.1462520956993103, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2451 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4083154633446115e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.045878136200717, "grad_norm": 0.06542823423939587, "learning_rate": 7.271314425616226e-08, "loss": 0.0, "num_tokens": 199442570.0, "reward": 0.8671875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2452 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917003347966285e-09, "advantages/std": 0.4676017463207245, "advantages/var": 0.21865139316219118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.048745519713262, "grad_norm": 0.10509837429756382, "learning_rate": 7.22925064020966e-08, "loss": 0.0, "num_tokens": 199522282.0, "reward": 0.8359375, "reward_std": 0.11336849629878998, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2453 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.051612903225807, "grad_norm": 0.08909232513147333, "learning_rate": 7.187304312082243e-08, "loss": -0.0, "num_tokens": 199610659.0, "reward": 0.8515625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2454 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966992261291921e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.054480286738351, "grad_norm": 0.11894177574843282, "learning_rate": 7.145475494342301e-08, "loss": -0.0, "num_tokens": 199688065.0, "reward": 0.953125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2455 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.126132177603901e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.057347670250896, "grad_norm": 0.12626331846484162, "learning_rate": 7.103764239949405e-08, "loss": 0.0, "num_tokens": 199757273.0, "reward": 0.8515625, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2456 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954489382432772e-08, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.060215053763441, "grad_norm": 0.14694301392642373, "learning_rate": 7.062170601714301e-08, "loss": 0.0, "num_tokens": 199838388.0, "reward": 0.765625, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2457 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.063082437275986, "grad_norm": 0.1048089583277469, "learning_rate": 7.020694632298784e-08, "loss": 0.0, "num_tokens": 199912824.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2458 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.06594982078853, "grad_norm": 0.0, "learning_rate": 6.979336384215695e-08, "loss": 0.0, "num_tokens": 199985682.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2459 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 7.068817204301075, "grad_norm": 0.1195769342952238, "learning_rate": 6.938095909828789e-08, "loss": 0.0, "num_tokens": 200073777.0, "reward": 0.765625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2460 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958906628562059e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.07168458781362, "grad_norm": 0.06898870735602258, "learning_rate": 6.896973261352778e-08, "loss": 0.0, "num_tokens": 200152062.0, "reward": 0.8984375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2461 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.074551971326165, "grad_norm": 0.03610014767434947, "learning_rate": 6.855968490853104e-08, "loss": 0.0, "num_tokens": 200226868.0, "reward": 0.984375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2462 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9876548503938182e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.077419354838709, "grad_norm": 0.12298941786189987, "learning_rate": 6.815081650246047e-08, "loss": -0.0, "num_tokens": 200308022.0, "reward": 0.796875, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2463 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.3278497974719924e-09, "advantages/std": 0.7013764381408691, "advantages/var": 0.49192890797917244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.080286738351255, "grad_norm": 0.21915388336290187, "learning_rate": 6.774312791298509e-08, "loss": 0.0, "num_tokens": 200394472.0, "reward": 0.7734375, "reward_std": 0.21937325596809387, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2464 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.0831541218637994, "grad_norm": 0.0, "learning_rate": 6.73366196562808e-08, "loss": 0.0, "num_tokens": 200460705.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2465 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.971038697911764e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.086021505376344, "grad_norm": 0.09577246218740154, "learning_rate": 6.693129224702831e-08, "loss": 0.0, "num_tokens": 200544319.0, "reward": 0.78125, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2466 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633261853378446e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.088888888888889, "grad_norm": 0.09586888057076874, "learning_rate": 6.652714619841404e-08, "loss": 0.0, "num_tokens": 200619383.0, "reward": 0.8984375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2467 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 7.091756272401434, "grad_norm": 0.04584529705030624, "learning_rate": 6.6124182022128e-08, "loss": -0.0, "num_tokens": 200695502.0, "reward": 0.859375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2468 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33065706491470337, "advantages/var": 0.10933409457800636, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.094623655913979, "grad_norm": 0.07981054314409838, "learning_rate": 6.572240022836451e-08, "loss": -0.0, "num_tokens": 200764514.0, "reward": 0.875, "reward_std": 0.06681530922651291, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2469 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.097491039426523, "grad_norm": 0.06356528690336953, "learning_rate": 6.53218013258201e-08, "loss": 0.0, "num_tokens": 200829199.0, "reward": 0.8359375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2470 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.100358422939068, "grad_norm": 0.05365157869329496, "learning_rate": 6.492238582169451e-08, "loss": 0.0, "num_tokens": 200917552.0, "reward": 0.875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2471 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876051507987635e-09, "advantages/std": 0.4675931930541992, "advantages/var": 0.21864339419062162, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.103225806451613, "grad_norm": 0.14637648913578724, "learning_rate": 6.452415422168845e-08, "loss": 0.0, "num_tokens": 201004050.0, "reward": 0.7421875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2472 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23380985856056213, "advantages/var": 0.05466704996011007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.106093189964158, "grad_norm": 0.04838502213851617, "learning_rate": 6.412710703000367e-08, "loss": -0.0, "num_tokens": 201073736.0, "reward": 0.84375, "reward_std": 0.033407654613256454, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2473 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.108960573476702, "grad_norm": 0.036619862059781794, "learning_rate": 6.37312447493431e-08, "loss": 0.0, "num_tokens": 201151569.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2474 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966992261291921e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.111827956989248, "grad_norm": 0.11462852942974504, "learning_rate": 6.33365678809088e-08, "loss": -0.0, "num_tokens": 201222822.0, "reward": 0.90625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2475 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.114695340501792, "grad_norm": 0.08145859769459904, "learning_rate": 6.294307692440215e-08, "loss": 0.0, "num_tokens": 201304069.0, "reward": 0.9375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2476 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.7248074468154485e-08, "advantages/std": 0.4049680531024933, "advantages/var": 0.16399912403362382, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.117562724014337, "grad_norm": 0.07785665518342678, "learning_rate": 6.255077237802286e-08, "loss": 0.0, "num_tokens": 201381240.0, "reward": 0.8671875, "reward_std": 0.09704046696424484, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2477 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.1266652673742488e-08, "advantages/std": 0.33064746856689453, "advantages/var": 0.10932774846969551, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.120430107526881, "grad_norm": 0.07746058409982864, "learning_rate": 6.215965473846896e-08, "loss": -0.0, "num_tokens": 201459479.0, "reward": 0.90625, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2478 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.123297491039427, "grad_norm": 0.07708215383076802, "learning_rate": 6.176972450093543e-08, "loss": -0.0, "num_tokens": 201540054.0, "reward": 0.875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2479 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.1261648745519715, "grad_norm": 0.059661576972812075, "learning_rate": 6.138098215911391e-08, "loss": -0.0, "num_tokens": 201628433.0, "reward": 0.7890625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2480 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.129032258064516, "grad_norm": 0.056466482049052524, "learning_rate": 6.099342820519183e-08, "loss": 0.0, "num_tokens": 201706500.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2481 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.4393170339684606e-09, "advantages/std": 0.5726946592330933, "advantages/var": 0.3279791727141088, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.131899641577061, "grad_norm": 0.11187153366748942, "learning_rate": 6.060706312985253e-08, "loss": 0.0, "num_tokens": 201798919.0, "reward": 0.7421875, "reward_std": 0.17123225331306458, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2482 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.041969672843697e-10, "advantages/std": 0.6612656712532043, "advantages/var": 0.4372722879779509, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.134767025089606, "grad_norm": 0.17394203671347708, "learning_rate": 6.022188742227374e-08, "loss": 0.0, "num_tokens": 201878505.0, "reward": 0.796875, "reward_std": 0.19727617502212524, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2483 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.137634408602151, "grad_norm": 0.06380423232231565, "learning_rate": 5.983790157012736e-08, "loss": -0.0, "num_tokens": 201960377.0, "reward": 0.9453125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2484 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.816724861393605e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.140501792114695, "grad_norm": 0.042265642867532593, "learning_rate": 5.9455106059578596e-08, "loss": -0.0, "num_tokens": 202035597.0, "reward": 0.953125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2485 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7815011540266774e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.14336917562724, "grad_norm": 0.11748549556727116, "learning_rate": 5.907350137528622e-08, "loss": 0.0, "num_tokens": 202109527.0, "reward": 0.7109375, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 2486 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.991685990697206e-09, "advantages/std": 0.4676051139831543, "advantages/var": 0.21865454262319872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.146236559139785, "grad_norm": 0.10056455570577012, "learning_rate": 5.8693088000400736e-08, "loss": 0.0, "num_tokens": 202187940.0, "reward": 0.8515625, "reward_std": 0.11784426867961884, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2487 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 7.14910394265233, "grad_norm": 0.07523463956060042, "learning_rate": 5.8313866416564436e-08, "loss": 0.0, "num_tokens": 202273211.0, "reward": 0.8203125, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2488 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983562397524497e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.151971326164874, "grad_norm": 0.13710367582215893, "learning_rate": 5.793583710391059e-08, "loss": 0.0, "num_tokens": 202354516.0, "reward": 0.8046875, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2489 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199023822625729e-09, "advantages/std": 0.4049658179283142, "advantages/var": 0.16399731369034853, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.15483870967742, "grad_norm": 0.18112412261793998, "learning_rate": 5.755900054106333e-08, "loss": 0.0, "num_tokens": 202434011.0, "reward": 0.8046875, "reward_std": 0.094686359167099, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2490 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.157706093189964, "grad_norm": 0.08031875099903989, "learning_rate": 5.718335720513601e-08, "loss": 0.0, "num_tokens": 202498509.0, "reward": 0.9765625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 2491 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.160573476702509, "grad_norm": 0.057577147177145155, "learning_rate": 5.680890757173207e-08, "loss": -0.0, "num_tokens": 202581140.0, "reward": 0.8203125, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2492 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 7.163440860215053, "grad_norm": 0.08395981953565397, "learning_rate": 5.643565211494283e-08, "loss": -0.0, "num_tokens": 202663619.0, "reward": 0.828125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2493 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599761390615809e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.166308243727599, "grad_norm": 0.10697476091480708, "learning_rate": 5.606359130734806e-08, "loss": 0.0, "num_tokens": 202743150.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2494 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.1691756272401435, "grad_norm": 0.05948306440981583, "learning_rate": 5.56927256200147e-08, "loss": -0.0, "num_tokens": 202822842.0, "reward": 0.875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2495 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 7.172043010752688, "grad_norm": 0.09644223746896867, "learning_rate": 5.532305552249705e-08, "loss": 0.0, "num_tokens": 202904984.0, "reward": 0.75, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2496 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.4082993428404723e-08, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.174910394265233, "grad_norm": 0.06960546121549602, "learning_rate": 5.495458148283505e-08, "loss": 0.0, "num_tokens": 202983316.0, "reward": 0.890625, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2497 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983361318629381e-09, "advantages/std": 0.4676063656806946, "advantages/var": 0.21865571322510746, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.177777777777778, "grad_norm": 0.09059362245450198, "learning_rate": 5.4587303967554954e-08, "loss": -0.0, "num_tokens": 203064975.0, "reward": 0.828125, "reward_std": 0.11913755536079407, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2498 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.180645161290323, "grad_norm": 0.07122944248895587, "learning_rate": 5.422122344166735e-08, "loss": 0.0, "num_tokens": 203154506.0, "reward": 0.8984375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2499 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.183512544802867, "grad_norm": 0.09372971274196308, "learning_rate": 5.385634036866793e-08, "loss": 0.0, "num_tokens": 203233907.0, "reward": 0.765625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2500 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.186379928315413, "grad_norm": 0.10943414932965682, "learning_rate": 5.349265521053603e-08, "loss": -0.0, "num_tokens": 203308143.0, "reward": 0.8203125, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2501 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.189247311827957, "grad_norm": 0.14114137293314274, "learning_rate": 5.3130168427734434e-08, "loss": 0.0, "num_tokens": 203373971.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2502 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 8.962749759103603e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.192114695340502, "grad_norm": 0.15433333711897534, "learning_rate": 5.2768880479208356e-08, "loss": 0.0, "num_tokens": 203461764.0, "reward": 0.8515625, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2503 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.194982078853046, "grad_norm": 0.09641316610306644, "learning_rate": 5.2408791822385664e-08, "loss": 0.0, "num_tokens": 203528589.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2504 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.197849462365592, "grad_norm": 0.12211529528959136, "learning_rate": 5.204990291317535e-08, "loss": -0.0, "num_tokens": 203608123.0, "reward": 0.765625, "reward_std": 0.14123955368995667, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2505 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.200716845878136, "grad_norm": 0.09335209234175323, "learning_rate": 5.1692214205967476e-08, "loss": 0.0, "num_tokens": 203674869.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2506 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.203584229390681, "grad_norm": 0.11655707639584922, "learning_rate": 5.133572615363269e-08, "loss": 0.0, "num_tokens": 203753097.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2507 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907227504745508e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.2064516129032254, "grad_norm": 0.12189736363683759, "learning_rate": 5.0980439207521485e-08, "loss": -0.0, "num_tokens": 203832120.0, "reward": 0.75, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2508 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.209318996415771, "grad_norm": 0.054068834445732794, "learning_rate": 5.0626353817463606e-08, "loss": -0.0, "num_tokens": 203915326.0, "reward": 0.7578125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2509 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.2121863799283155, "grad_norm": 0.09532110290015222, "learning_rate": 5.027347043176722e-08, "loss": 0.0, "num_tokens": 203983549.0, "reward": 0.875, "reward_std": 0.08785156905651093, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2510 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599751573415311e-09, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.21505376344086, "grad_norm": 0.06153929717244519, "learning_rate": 4.9921789497218925e-08, "loss": 0.0, "num_tokens": 204062587.0, "reward": 0.953125, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2511 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.217921146953405, "grad_norm": 0.06708426684173453, "learning_rate": 4.957131145908311e-08, "loss": 0.0, "num_tokens": 204135455.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2512 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.22078853046595, "grad_norm": 0.049674688426611074, "learning_rate": 4.9222036761100595e-08, "loss": -0.0, "num_tokens": 204205206.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2513 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.0349462367204652e-08, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.223655913978495, "grad_norm": 0.06878477145620411, "learning_rate": 4.887396584548909e-08, "loss": 0.0, "num_tokens": 204290056.0, "reward": 0.7734375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2514 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.3798774372216438e-08, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.226523297491039, "grad_norm": 0.11566865259735848, "learning_rate": 4.852709915294195e-08, "loss": -0.0, "num_tokens": 204355728.0, "reward": 0.890625, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2515 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.229390681003585, "grad_norm": 0.08328792189926526, "learning_rate": 4.818143712262812e-08, "loss": 0.0, "num_tokens": 204438845.0, "reward": 0.8515625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2516 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966992261291921e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.232258064516129, "grad_norm": 0.07904085007979987, "learning_rate": 4.783698019219118e-08, "loss": -0.0, "num_tokens": 204523569.0, "reward": 0.78125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2517 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.235125448028674, "grad_norm": 0.07040339549726506, "learning_rate": 4.7493728797748713e-08, "loss": -0.0, "num_tokens": 204608401.0, "reward": 0.8046875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2518 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.237992831541218, "grad_norm": 0.12930695883543628, "learning_rate": 4.7151683373892306e-08, "loss": 0.0, "num_tokens": 204693577.0, "reward": 0.7265625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2519 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.757231577566731e-09, "advantages/std": 0.5726968050003052, "advantages/var": 0.32798163045755757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.240860215053764, "grad_norm": 0.14847926465783368, "learning_rate": 4.681084435368665e-08, "loss": 0.0, "num_tokens": 204779642.0, "reward": 0.890625, "reward_std": 0.17464719712734222, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2520 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.243727598566308, "grad_norm": 0.09385543760658205, "learning_rate": 4.647121216866856e-08, "loss": 0.0, "num_tokens": 204848826.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2521 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016648251989223e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.246594982078853, "grad_norm": 0.12028791578704512, "learning_rate": 4.61327872488475e-08, "loss": 0.0, "num_tokens": 204930284.0, "reward": 0.828125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2522 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.2494623655913975, "grad_norm": 0.0775232523613318, "learning_rate": 4.5795570022703954e-08, "loss": 0.0, "num_tokens": 205015997.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2523 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.252329749103943, "grad_norm": 0.09731319644944193, "learning_rate": 4.54595609171895e-08, "loss": 0.0, "num_tokens": 205089865.0, "reward": 0.7890625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2524 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.2551971326164875, "grad_norm": 0.09186376937235984, "learning_rate": 4.512476035772628e-08, "loss": 0.0, "num_tokens": 205163985.0, "reward": 0.7578125, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2525 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721527138226332e-09, "advantages/std": 0.5227934122085571, "advantages/var": 0.27331295184866633, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.258064516129032, "grad_norm": 0.14501961184136447, "learning_rate": 4.479116876820588e-08, "loss": -0.0, "num_tokens": 205249213.0, "reward": 0.875, "reward_std": 0.1422954797744751, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2526 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.260931899641577, "grad_norm": 0.10031695362138324, "learning_rate": 4.445878657098978e-08, "loss": -0.0, "num_tokens": 205328564.0, "reward": 0.7734375, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2527 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383701047106082e-08, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.263799283154122, "grad_norm": 0.15816144497770732, "learning_rate": 4.412761418690747e-08, "loss": 0.0, "num_tokens": 205421127.0, "reward": 0.8515625, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2528 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.266666666666667, "grad_norm": 0.06483258155619981, "learning_rate": 4.3797652035257536e-08, "loss": 0.0, "num_tokens": 205505505.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2529 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.269534050179211, "grad_norm": 0.11694285020397602, "learning_rate": 4.3468900533805694e-08, "loss": 0.0, "num_tokens": 205575428.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2530 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.272401433691757, "grad_norm": 0.017510490695550458, "learning_rate": 4.314136009878511e-08, "loss": -0.0, "num_tokens": 205647145.0, "reward": 0.8828125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2531 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449710856633628e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.275268817204301, "grad_norm": 0.07153917715055116, "learning_rate": 4.2815031144895484e-08, "loss": 0.0, "num_tokens": 205718945.0, "reward": 0.875, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2532 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950488391937882e-08, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.278136200716846, "grad_norm": 0.17714243005397393, "learning_rate": 4.248991408530278e-08, "loss": 0.0, "num_tokens": 205788402.0, "reward": 0.953125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2533 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.28100358422939, "grad_norm": 0.10279823826708101, "learning_rate": 4.2166009331638494e-08, "loss": 0.0, "num_tokens": 205858936.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2534 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.3942077395823529e-08, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.283870967741936, "grad_norm": 0.09854792207558746, "learning_rate": 4.184331729399937e-08, "loss": 0.0, "num_tokens": 205939534.0, "reward": 0.875, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2535 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.28673835125448, "grad_norm": 0.0528413826752298, "learning_rate": 4.152183838094636e-08, "loss": 0.0, "num_tokens": 206021897.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2536 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.1579239975137157e-08, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.289605734767025, "grad_norm": 0.12729764081841807, "learning_rate": 4.1201572999504995e-08, "loss": 0.0, "num_tokens": 206104077.0, "reward": 0.8671875, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2537 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449667444137735e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.29247311827957, "grad_norm": 0.12420338671443103, "learning_rate": 4.088252155516403e-08, "loss": -0.0, "num_tokens": 206178062.0, "reward": 0.9296875, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2538 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814513910737996e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.295340501792115, "grad_norm": 0.15547905086589858, "learning_rate": 4.0564684451874996e-08, "loss": -0.0, "num_tokens": 206269177.0, "reward": 0.828125, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2539 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.2982078853046595, "grad_norm": 0.09306918284991635, "learning_rate": 4.024806209205256e-08, "loss": 0.0, "num_tokens": 206344200.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2540 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.301075268817204, "grad_norm": 0.0476275501816316, "learning_rate": 3.9932654876573155e-08, "loss": 0.0, "num_tokens": 206409750.0, "reward": 0.9765625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 2541 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.572684645652771, "advantages/var": 0.3279677033664399, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.3039426523297495, "grad_norm": 0.11299043851954678, "learning_rate": 3.9618463204774467e-08, "loss": -0.0, "num_tokens": 206492881.0, "reward": 0.8984375, "reward_std": 0.16203844547271729, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2542 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749358156051495e-09, "advantages/std": 0.4049680531024933, "advantages/var": 0.16399912403362382, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.306810035842294, "grad_norm": 0.09498043700801338, "learning_rate": 3.930548747445528e-08, "loss": 0.0, "num_tokens": 206566138.0, "reward": 0.8359375, "reward_std": 0.09704046696424484, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2543 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.309677419354839, "grad_norm": 0.06873867862464882, "learning_rate": 3.899372808187506e-08, "loss": 0.0, "num_tokens": 206630974.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2544 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 7.312544802867383, "grad_norm": 0.0839417986922333, "learning_rate": 3.868318542175331e-08, "loss": -0.0, "num_tokens": 206717574.0, "reward": 0.7734375, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2545 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.315412186379929, "grad_norm": 0.10666014521366664, "learning_rate": 3.8373859887268714e-08, "loss": 0.0, "num_tokens": 206799899.0, "reward": 0.78125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2546 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.318279569892473, "grad_norm": 0.13026163817783157, "learning_rate": 3.8065751870059003e-08, "loss": 0.0, "num_tokens": 206887299.0, "reward": 0.71875, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 2547 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9752099207640785e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.321146953405018, "grad_norm": 0.1197530958272612, "learning_rate": 3.775886176022069e-08, "loss": -0.0, "num_tokens": 206963535.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2548 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983438498026945e-09, "advantages/std": 0.4675973057746887, "advantages/var": 0.21864724036774774, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.324014336917562, "grad_norm": 0.0837039966185904, "learning_rate": 3.7453189946308195e-08, "loss": 0.0, "num_tokens": 207052943.0, "reward": 0.734375, "reward_std": 0.11100947856903076, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2549 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.6917598869923385e-09, "advantages/std": 0.5726926326751709, "advantages/var": 0.3279768515204182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.326881720430108, "grad_norm": 0.11095626885101538, "learning_rate": 3.714873681533315e-08, "loss": 0.0, "num_tokens": 207147219.0, "reward": 0.890625, "reward_std": 0.1712273508310318, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2550 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524623174213196e-09, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.329749103942652, "grad_norm": 0.15933745156749995, "learning_rate": 3.6845502752764544e-08, "loss": 0.0, "num_tokens": 207226913.0, "reward": 0.7265625, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2551 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675973057746887, "advantages/var": 0.21864724036774774, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.332616487455197, "grad_norm": 0.08685867984371011, "learning_rate": 3.6543488142527615e-08, "loss": 0.0, "num_tokens": 207307112.0, "reward": 0.859375, "reward_std": 0.11100948601961136, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2552 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.335483870967742, "grad_norm": 0.051231125537430956, "learning_rate": 3.624269336700436e-08, "loss": 0.0, "num_tokens": 207400641.0, "reward": 0.6953125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 2553 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016396905864186e-09, "advantages/std": 0.5227974057197571, "advantages/var": 0.2733171274273083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.338351254480287, "grad_norm": 0.10042135568757057, "learning_rate": 3.5943118807031046e-08, "loss": -0.0, "num_tokens": 207480069.0, "reward": 0.9140625, "reward_std": 0.14465448260307312, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2554 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.3412186379928315, "grad_norm": 0.15793484797984386, "learning_rate": 3.5644764841900156e-08, "loss": 0.0, "num_tokens": 207557204.0, "reward": 0.71875, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.4513758420944214, "step": 2555 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.344086021505376, "grad_norm": 0.08624708986100145, "learning_rate": 3.5347631849358514e-08, "loss": -0.0, "num_tokens": 207641548.0, "reward": 0.90625, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2556 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.3469534050179215, "grad_norm": 0.06056629963832414, "learning_rate": 3.505172020560687e-08, "loss": 0.0, "num_tokens": 207717287.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2557 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691702426092771e-09, "advantages/std": 0.5726984143257141, "advantages/var": 0.3279834737711873, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.349820788530466, "grad_norm": 0.13642803320793265, "learning_rate": 3.4757030285299524e-08, "loss": 0.0, "num_tokens": 207799916.0, "reward": 0.734375, "reward_std": 0.17700131237506866, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2558 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.352688172043011, "grad_norm": 0.04221746753613528, "learning_rate": 3.4463562461544246e-08, "loss": 0.0, "num_tokens": 207884254.0, "reward": 0.9140625, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2559 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.355555555555555, "grad_norm": 0.24809742864211887, "learning_rate": 3.4171317105901486e-08, "loss": -0.0, "num_tokens": 207965121.0, "reward": 0.875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2560 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199317639730369e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.358422939068101, "grad_norm": 0.12351034662831538, "learning_rate": 3.388029458838359e-08, "loss": 0.0, "num_tokens": 208038743.0, "reward": 0.890625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2561 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.361290322580645, "grad_norm": 0.06414865538451837, "learning_rate": 3.3590495277455165e-08, "loss": -0.0, "num_tokens": 208120922.0, "reward": 0.8515625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2562 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.36415770609319, "grad_norm": 0.06516190830643677, "learning_rate": 3.3301919540031586e-08, "loss": 0.0, "num_tokens": 208192020.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2563 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9875832530345343e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.367025089605734, "grad_norm": 0.0795032768026142, "learning_rate": 3.301456774147959e-08, "loss": 0.0, "num_tokens": 208277136.0, "reward": 0.9140625, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2564 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958906628562059e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.36989247311828, "grad_norm": 0.11348462552313462, "learning_rate": 3.272844024561572e-08, "loss": 0.0, "num_tokens": 208345186.0, "reward": 0.9609375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2565 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.505166341645741e-09, "advantages/std": 0.5726664066314697, "advantages/var": 0.32794681328419983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.372759856630824, "grad_norm": 0.1530698628692608, "learning_rate": 3.244353741470707e-08, "loss": 0.0, "num_tokens": 208419884.0, "reward": 0.9140625, "reward_std": 0.13941732048988342, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2566 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.375627240143369, "grad_norm": 0.18265269781254576, "learning_rate": 3.215985960946943e-08, "loss": 0.0, "num_tokens": 208496416.0, "reward": 0.7890625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2567 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.967257841033182e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.378494623655914, "grad_norm": 0.10570541480773055, "learning_rate": 3.18774071890684e-08, "loss": 0.0, "num_tokens": 208578537.0, "reward": 0.84375, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2568 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.381362007168459, "grad_norm": 0.0, "learning_rate": 3.1596180511117235e-08, "loss": 0.0, "num_tokens": 208648020.0, "reward": 1.0, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 1.0, "rewards/drgrpo_math_reward/std": 0.0, "step": 2569 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.0655470077588484e-09, "advantages/std": 0.5726920366287231, "advantages/var": 0.32797616881795477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.3842293906810035, "grad_norm": 0.1392555252981525, "learning_rate": 3.1316179931678235e-08, "loss": 0.0, "num_tokens": 208736732.0, "reward": 0.8515625, "reward_std": 0.1701665222644806, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2570 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 7.041475135887903e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.387096774193548, "grad_norm": 0.0684602925732368, "learning_rate": 3.10374058052606e-08, "loss": -0.0, "num_tokens": 208817380.0, "reward": 0.9296875, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2571 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0112061672975522e-09, "advantages/std": 0.6185711026191711, "advantages/var": 0.38263020899549716, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.3899641577060935, "grad_norm": 0.21612746472662595, "learning_rate": 3.075985848482077e-08, "loss": 0.0, "num_tokens": 208895255.0, "reward": 0.8046875, "reward_std": 0.18884865939617157, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2572 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.392831541218638, "grad_norm": 0.10069243812865293, "learning_rate": 3.04835383217622e-08, "loss": -0.0, "num_tokens": 208982394.0, "reward": 0.765625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2573 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.395698924731183, "grad_norm": 0.07040539274525603, "learning_rate": 3.0208445665934836e-08, "loss": 0.0, "num_tokens": 209064964.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2574 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.2946288161655503e-08, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.398566308243727, "grad_norm": 0.10356970342878828, "learning_rate": 2.993458086563405e-08, "loss": 0.0, "num_tokens": 209141967.0, "reward": 0.8828125, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2575 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.401433691756273, "grad_norm": 0.05903156545806232, "learning_rate": 2.9661944267600492e-08, "loss": -0.0, "num_tokens": 209225654.0, "reward": 0.8359375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2576 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.404301075268817, "grad_norm": 0.0, "learning_rate": 2.9390536217020147e-08, "loss": 0.0, "num_tokens": 209296094.0, "reward": 1.0, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 1.0, "rewards/drgrpo_math_reward/std": 0.0, "step": 2577 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.407168458781362, "grad_norm": 0.03914098693528715, "learning_rate": 2.912035705752369e-08, "loss": 0.0, "num_tokens": 209370141.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2578 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 7.410035842293907, "grad_norm": 0.09784656376581713, "learning_rate": 2.88514071311855e-08, "loss": 0.0, "num_tokens": 209452454.0, "reward": 0.765625, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2579 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344527836563254e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 7.412903225806452, "grad_norm": 0.13681151495552854, "learning_rate": 2.858368677852352e-08, "loss": 0.0, "num_tokens": 209547865.0, "reward": 0.828125, "reward_std": 0.1173202320933342, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2580 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.494855892459685e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.415770609318996, "grad_norm": 0.09461766747123089, "learning_rate": 2.8317196338499493e-08, "loss": 0.0, "num_tokens": 209626804.0, "reward": 0.8203125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2581 }, { "advantages/mean": -7.450580596923828e-09, "advantages/snr": 1.2044967427438685e-08, "advantages/std": 0.6185637712478638, "advantages/var": 0.38262113910037954, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.418637992831541, "grad_norm": 0.12677449799390703, "learning_rate": 2.805193614851742e-08, "loss": 0.0, "num_tokens": 209712096.0, "reward": 0.84375, "reward_std": 0.17965976893901825, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2582 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950488391937882e-08, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.421505376344086, "grad_norm": 0.10649650202454893, "learning_rate": 2.7787906544424088e-08, "loss": 0.0, "num_tokens": 209793105.0, "reward": 0.890625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2583 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299846843318419e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.424372759856631, "grad_norm": 0.12365945314753753, "learning_rate": 2.7525107860507767e-08, "loss": 0.0, "num_tokens": 209868130.0, "reward": 0.7421875, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43914902210235596, "step": 2584 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.966544250856589e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.4272401433691755, "grad_norm": 0.04767377163044936, "learning_rate": 2.7263540429498744e-08, "loss": -0.0, "num_tokens": 209949629.0, "reward": 0.8359375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2585 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983496130645961e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.43010752688172, "grad_norm": 0.1227084083377906, "learning_rate": 2.700320458256833e-08, "loss": 0.0, "num_tokens": 210033480.0, "reward": 0.859375, "reward_std": 0.10205792635679245, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2586 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149903618877876e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.4329749103942655, "grad_norm": 0.09206992125443138, "learning_rate": 2.6744100649327973e-08, "loss": 0.0, "num_tokens": 210112615.0, "reward": 0.8125, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2587 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.43584229390681, "grad_norm": 0.06789904170480178, "learning_rate": 2.6486228957830147e-08, "loss": 0.0, "num_tokens": 210185126.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2588 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907665222004876e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.438709677419355, "grad_norm": 0.09063198803444496, "learning_rate": 2.62295898345668e-08, "loss": 0.0, "num_tokens": 210262699.0, "reward": 0.9609375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2589 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344329800322181e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.4415770609319, "grad_norm": 0.18881555002510433, "learning_rate": 2.5974183604469347e-08, "loss": 0.0, "num_tokens": 210342850.0, "reward": 0.8515625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2590 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917480653229804e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.444444444444445, "grad_norm": 0.14514947640500095, "learning_rate": 2.5720010590908115e-08, "loss": -0.0, "num_tokens": 210419831.0, "reward": 0.8125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2591 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.447311827956989, "grad_norm": 0.08303006583482903, "learning_rate": 2.546707111569235e-08, "loss": 0.0, "num_tokens": 210495603.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2592 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016504754270957e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.450179211469534, "grad_norm": 0.11073366251997893, "learning_rate": 2.5215365499069442e-08, "loss": 0.0, "num_tokens": 210581943.0, "reward": 0.859375, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2593 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.453046594982079, "grad_norm": 0.07828569838957963, "learning_rate": 2.496489405972435e-08, "loss": 0.0, "num_tokens": 210664161.0, "reward": 0.6640625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.47417303919792175, "step": 2594 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.455913978494624, "grad_norm": 0.059419224218842485, "learning_rate": 2.471565711477952e-08, "loss": 0.0, "num_tokens": 210726960.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2595 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.458781362007168, "grad_norm": 0.0727835119606027, "learning_rate": 2.4467654979794638e-08, "loss": 0.0, "num_tokens": 210798391.0, "reward": 0.8359375, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2596 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.461648745519713, "grad_norm": 0.08160964499162693, "learning_rate": 2.4220887968765868e-08, "loss": -0.0, "num_tokens": 210873675.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2597 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.449634187350785e-09, "advantages/std": 0.4049657881259918, "advantages/var": 0.1639972895525057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.464516129032258, "grad_norm": 0.1949232815311928, "learning_rate": 2.397535639412551e-08, "loss": -0.0, "num_tokens": 210960638.0, "reward": 0.8515625, "reward_std": 0.094686359167099, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2598 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.958683201273463e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.467383512544803, "grad_norm": 0.11237903132808232, "learning_rate": 2.3731060566741455e-08, "loss": 0.0, "num_tokens": 211029634.0, "reward": 0.8203125, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2599 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.4702508960573475, "grad_norm": 0.12123751878436803, "learning_rate": 2.3488000795917505e-08, "loss": 0.0, "num_tokens": 211107753.0, "reward": 0.9140625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2600 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.473118279569892, "grad_norm": 0.02415472199030343, "learning_rate": 2.3246177389392384e-08, "loss": 0.0, "num_tokens": 211183526.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2601 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.4759856630824375, "grad_norm": 0.09135367348034021, "learning_rate": 2.3005590653338958e-08, "loss": 0.0, "num_tokens": 211249316.0, "reward": 0.8984375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2602 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.478853046594982, "grad_norm": 0.06598533088634849, "learning_rate": 2.2766240892365006e-08, "loss": 0.0, "num_tokens": 211330585.0, "reward": 0.671875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 2603 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.481720430107527, "grad_norm": 0.09261287130862039, "learning_rate": 2.2528128409511792e-08, "loss": -0.0, "num_tokens": 211408453.0, "reward": 0.75, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2604 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 7.484587813620072, "grad_norm": 0.06230076952933386, "learning_rate": 2.2291253506253936e-08, "loss": 0.0, "num_tokens": 211484834.0, "reward": 0.921875, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2605 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975144418133046e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.487455197132617, "grad_norm": 0.07346304914257873, "learning_rate": 2.205561648249943e-08, "loss": 0.0, "num_tokens": 211566520.0, "reward": 0.7890625, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2606 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966756148857264e-09, "advantages/std": 0.467604398727417, "advantages/var": 0.21865387370922917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.490322580645161, "grad_norm": 0.11220421222180983, "learning_rate": 2.1821217636589174e-08, "loss": 0.0, "num_tokens": 211644505.0, "reward": 0.671875, "reward_std": 0.11678344011306763, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4713755249977112, "step": 2607 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 1.2945884681216828e-08, "advantages/std": 0.4676077961921692, "advantages/var": 0.21865705105969724, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.493189964157706, "grad_norm": 0.1079590307864344, "learning_rate": 2.1588057265295778e-08, "loss": 0.0, "num_tokens": 211732485.0, "reward": 0.703125, "reward_std": 0.12125921249389648, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45867621898651123, "step": 2608 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.496057347670251, "grad_norm": 0.07966709391206533, "learning_rate": 2.1356135663824326e-08, "loss": 0.0, "num_tokens": 211813730.0, "reward": 0.75, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2609 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983473280509385e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.498924731182796, "grad_norm": 0.10580857336392352, "learning_rate": 2.1125453125811376e-08, "loss": 0.0, "num_tokens": 211887361.0, "reward": 0.9296875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2610 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.13132843852049e-10, "advantages/std": 0.5726755261421204, "advantages/var": 0.3279572582421544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "epoch": 7.50179211469534, "grad_norm": 0.13645734666840212, "learning_rate": 2.0896009943324632e-08, "loss": 0.0, "num_tokens": 211984432.0, "reward": 0.796875, "reward_std": 0.15072788298130035, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2611 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876049603820392e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.504659498207886, "grad_norm": 0.08790942342493094, "learning_rate": 2.066780640686272e-08, "loss": -0.0, "num_tokens": 212073616.0, "reward": 0.8671875, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2612 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.50752688172043, "grad_norm": 0.09131229022780211, "learning_rate": 2.044084280535452e-08, "loss": 0.0, "num_tokens": 212152939.0, "reward": 0.8359375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2613 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.510394265232975, "grad_norm": 0.08429131273522639, "learning_rate": 2.02151194261595e-08, "loss": -0.0, "num_tokens": 212219148.0, "reward": 0.84375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2614 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.5132616487455195, "grad_norm": 0.10716352857812375, "learning_rate": 1.9990636555066497e-08, "loss": 0.0, "num_tokens": 212301104.0, "reward": 0.859375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2615 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629027821475993e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.516129032258064, "grad_norm": 0.08845085702452267, "learning_rate": 1.976739447629383e-08, "loss": -0.0, "num_tokens": 212395020.0, "reward": 0.8125, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2616 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983533706996105e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.5189964157706095, "grad_norm": 0.07328193602838753, "learning_rate": 1.9545393472488736e-08, "loss": 0.0, "num_tokens": 212471284.0, "reward": 0.6328125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.4839322865009308, "step": 2617 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.504827144745217e-09, "advantages/std": 0.5726962685585022, "advantages/var": 0.3279810160208321, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.521863799283154, "grad_norm": 0.13204499760923247, "learning_rate": 1.9324633824727266e-08, "loss": 0.0, "num_tokens": 212552830.0, "reward": 0.8515625, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2618 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.524731182795699, "grad_norm": 0.018722105156220862, "learning_rate": 1.910511581251406e-08, "loss": -0.0, "num_tokens": 212645788.0, "reward": 0.8203125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2619 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958237646101366e-10, "advantages/std": 0.46761414408683777, "advantages/var": 0.21866298775006587, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.527598566308244, "grad_norm": 0.08477950311774352, "learning_rate": 1.8886839713781133e-08, "loss": 0.0, "num_tokens": 212728777.0, "reward": 0.7578125, "reward_std": 0.12597234547138214, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2620 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.563018557708836e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.530465949820789, "grad_norm": 0.15156927480755558, "learning_rate": 1.8669805804888418e-08, "loss": 0.0, "num_tokens": 212811023.0, "reward": 0.859375, "reward_std": 0.1173202320933342, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2621 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899641578136434e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.533333333333333, "grad_norm": 0.15132867500073152, "learning_rate": 1.8454014360623217e-08, "loss": 0.0, "num_tokens": 212878184.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2622 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.878747807970186e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.536200716845878, "grad_norm": 0.1360717886896722, "learning_rate": 1.8239465654199648e-08, "loss": 0.0, "num_tokens": 212957693.0, "reward": 0.90625, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2623 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.539068100358423, "grad_norm": 0.031033377471276283, "learning_rate": 1.8026159957258092e-08, "loss": 0.0, "num_tokens": 213032690.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2624 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.541935483870968, "grad_norm": 0.10799227920738447, "learning_rate": 1.7814097539865625e-08, "loss": 0.0, "num_tokens": 213107605.0, "reward": 0.953125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2625 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.544802867383512, "grad_norm": 0.07755664155455574, "learning_rate": 1.7603278670515144e-08, "loss": 0.0, "num_tokens": 213178410.0, "reward": 0.9765625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 2626 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.547670250896058, "grad_norm": 0.0730335684037607, "learning_rate": 1.73937036161248e-08, "loss": 0.0, "num_tokens": 213258863.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2627 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.550537634408602, "grad_norm": 0.059015984902569284, "learning_rate": 1.718537264203801e-08, "loss": -0.0, "num_tokens": 213330496.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2628 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629027821475993e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.553405017921147, "grad_norm": 0.11231311667042682, "learning_rate": 1.6978286012023222e-08, "loss": -0.0, "num_tokens": 213421396.0, "reward": 0.75, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2629 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.9709613904608755e-09, "advantages/std": 0.46760106086730957, "advantages/var": 0.21865075212423335, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.5562724014336915, "grad_norm": 0.11214757300032535, "learning_rate": 1.6772443988273377e-08, "loss": 0.0, "num_tokens": 213507711.0, "reward": 0.9375, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2630 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.7495735248314805e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.559139784946236, "grad_norm": 0.09602851981784188, "learning_rate": 1.6567846831405663e-08, "loss": -0.0, "num_tokens": 213590115.0, "reward": 0.65625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 2631 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.5620071684587815, "grad_norm": 0.12431809933411388, "learning_rate": 1.636449480046076e-08, "loss": -0.0, "num_tokens": 213671775.0, "reward": 0.75, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2632 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.564874551971326, "grad_norm": 0.06941940624352973, "learning_rate": 1.6162388152903493e-08, "loss": 0.0, "num_tokens": 213750337.0, "reward": 0.8671875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2633 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.567741935483871, "grad_norm": 0.08651216510041282, "learning_rate": 1.5961527144621402e-08, "loss": 0.0, "num_tokens": 213830135.0, "reward": 0.859375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2634 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1498891480459116e-08, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.570609318996416, "grad_norm": 0.07142790297650317, "learning_rate": 1.5761912029925384e-08, "loss": -0.0, "num_tokens": 213900475.0, "reward": 0.9453125, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2635 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.573476702508961, "grad_norm": 0.04969763682950268, "learning_rate": 1.5563543061548166e-08, "loss": -0.0, "num_tokens": 213977629.0, "reward": 0.765625, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2636 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907216333870301e-10, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.576344086021505, "grad_norm": 0.14655240702803526, "learning_rate": 1.536642049064574e-08, "loss": -0.0, "num_tokens": 214067352.0, "reward": 0.7265625, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2637 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998470125758874e-09, "advantages/std": 0.40494978427886963, "advantages/var": 0.16398432778750305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.57921146953405, "grad_norm": 0.08783129596597063, "learning_rate": 1.5170544566795006e-08, "loss": 0.0, "num_tokens": 214149402.0, "reward": 0.8984375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2638 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449667444137735e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.582078853046595, "grad_norm": 0.10881172440004862, "learning_rate": 1.4975915537995267e-08, "loss": 0.0, "num_tokens": 214228826.0, "reward": 0.8046875, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2639 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.58494623655914, "grad_norm": 0.044457005489313625, "learning_rate": 1.478253365066673e-08, "loss": 0.0, "num_tokens": 214300863.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2640 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.587813620071684, "grad_norm": 0.06460679066180552, "learning_rate": 1.4590399149650767e-08, "loss": 0.0, "num_tokens": 214377799.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2641 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516862785289507e-09, "advantages/std": 0.6185637712478638, "advantages/var": 0.38262113910037954, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.59068100358423, "grad_norm": 0.1520851435633976, "learning_rate": 1.4399512278209124e-08, "loss": -0.0, "num_tokens": 214463776.0, "reward": 0.8125, "reward_std": 0.17965975403785706, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2642 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899540529955257e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.593548387096774, "grad_norm": 0.10438232518063313, "learning_rate": 1.4209873278024475e-08, "loss": 0.0, "num_tokens": 214554856.0, "reward": 0.8515625, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2643 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.596415770609319, "grad_norm": 0.12431999646667162, "learning_rate": 1.402148238919898e-08, "loss": 0.0, "num_tokens": 214642966.0, "reward": 0.8515625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2644 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.749358156051495e-09, "advantages/std": 0.4049680531024933, "advantages/var": 0.16399912403362382, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.5992831541218635, "grad_norm": 0.08325927045210327, "learning_rate": 1.383433985025495e-08, "loss": -0.0, "num_tokens": 214717771.0, "reward": 0.8359375, "reward_std": 0.09704046696424484, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2645 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.602150537634409, "grad_norm": 0.11131178063694702, "learning_rate": 1.3648445898133964e-08, "loss": 0.0, "num_tokens": 214804945.0, "reward": 0.8515625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2646 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950220288145723e-08, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.6050179211469535, "grad_norm": 0.1255617216792842, "learning_rate": 1.3463800768196864e-08, "loss": 0.0, "num_tokens": 214882079.0, "reward": 0.875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2647 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344527836563254e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.607885304659498, "grad_norm": 0.1825647268533084, "learning_rate": 1.3280404694223313e-08, "loss": 0.0, "num_tokens": 214961601.0, "reward": 0.796875, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2648 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.610752688172043, "grad_norm": 0.10938545343611937, "learning_rate": 1.309825790841146e-08, "loss": 0.0, "num_tokens": 215050605.0, "reward": 0.859375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2649 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.613620071684588, "grad_norm": 0.0, "learning_rate": 1.2917360641377827e-08, "loss": 0.0, "num_tokens": 215118426.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2650 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958849501312727e-10, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.616487455197133, "grad_norm": 0.11817853305625323, "learning_rate": 1.273771312215699e-08, "loss": 0.0, "num_tokens": 215196015.0, "reward": 0.8125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2651 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.619354838709677, "grad_norm": 0.0, "learning_rate": 1.2559315578201223e-08, "loss": 0.0, "num_tokens": 215271289.0, "reward": 1.0, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 1.0, "rewards/drgrpo_math_reward/std": 0.0, "step": 2652 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599796258942519e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.622222222222222, "grad_norm": 0.1426115443118561, "learning_rate": 1.2382168235379743e-08, "loss": -0.0, "num_tokens": 215341639.0, "reward": 0.921875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2653 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.625089605734767, "grad_norm": 7.876231650309869, "learning_rate": 1.2206271317979577e-08, "loss": 0.0, "num_tokens": 215420069.0, "reward": 0.875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2654 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.627956989247312, "grad_norm": 0.07476541749539088, "learning_rate": 1.203162504870414e-08, "loss": 0.0, "num_tokens": 215496886.0, "reward": 0.953125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2655 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.299880526045478e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.630824372759856, "grad_norm": 0.13475767375141265, "learning_rate": 1.1858229648673446e-08, "loss": 0.0, "num_tokens": 215578056.0, "reward": 0.8828125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.322907418012619, "step": 2656 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125782003796406e-09, "advantages/std": 0.5227903723716736, "advantages/var": 0.2733097734445131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.633691756272402, "grad_norm": 0.1014995301777789, "learning_rate": 1.168608533742399e-08, "loss": 0.0, "num_tokens": 215665608.0, "reward": 0.84375, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2657 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.1383733037079748e-08, "advantages/std": 0.5726819038391113, "advantages/var": 0.32796456298478915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.636559139784946, "grad_norm": 0.12712537844489796, "learning_rate": 1.1515192332907875e-08, "loss": -0.0, "num_tokens": 215742089.0, "reward": 0.8515625, "reward_std": 0.15756267309188843, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2658 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.639426523297491, "grad_norm": 0.11537309470133952, "learning_rate": 1.1345550851493469e-08, "loss": 0.0, "num_tokens": 215819150.0, "reward": 0.96875, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2659 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983562397524497e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.6422939068100355, "grad_norm": 0.09980575467717229, "learning_rate": 1.1177161107964184e-08, "loss": 0.0, "num_tokens": 215898070.0, "reward": 0.9140625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2660 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.645161290322581, "grad_norm": 0.08570709282552061, "learning_rate": 1.1010023315518591e-08, "loss": 0.0, "num_tokens": 215979536.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2661 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 7.6480286738351255, "grad_norm": 0.0, "learning_rate": 1.0844137685770194e-08, "loss": 0.0, "num_tokens": 216047337.0, "reward": 0.8125, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2662 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344503462080032e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.65089605734767, "grad_norm": 0.13974908876271358, "learning_rate": 1.0679504428747543e-08, "loss": 0.0, "num_tokens": 216133284.0, "reward": 0.7734375, "reward_std": 0.12073517590761185, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2663 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.6537634408602155, "grad_norm": 0.06524778747399308, "learning_rate": 1.0516123752893013e-08, "loss": 0.0, "num_tokens": 216209635.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2664 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.749701315113695e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.65663082437276, "grad_norm": 0.1392669636215334, "learning_rate": 1.0353995865063137e-08, "loss": -0.0, "num_tokens": 216289854.0, "reward": 0.7265625, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2665 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.659498207885305, "grad_norm": 0.07184046073763721, "learning_rate": 1.0193120970528602e-08, "loss": -0.0, "num_tokens": 216380214.0, "reward": 0.734375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44340085983276367, "step": 2666 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.662365591397849, "grad_norm": 0.08833598069209586, "learning_rate": 1.00334992729737e-08, "loss": 0.0, "num_tokens": 216461696.0, "reward": 0.8125, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2667 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.3798774372216438e-08, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.665232974910394, "grad_norm": 0.08168051975988941, "learning_rate": 9.87513097449555e-09, "loss": -0.0, "num_tokens": 216543066.0, "reward": 0.890625, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2668 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.9752099207640785e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 7.668100358422939, "grad_norm": 0.10562166931093338, "learning_rate": 9.718016275604756e-09, "loss": -0.0, "num_tokens": 216632354.0, "reward": 0.7265625, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2669 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.670967741935484, "grad_norm": 0.17415648960053537, "learning_rate": 9.562155375224756e-09, "loss": 0.0, "num_tokens": 216714101.0, "reward": 0.8984375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2670 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.673835125448028, "grad_norm": 0.13275102999138666, "learning_rate": 9.407548470691251e-09, "loss": 0.0, "num_tokens": 216789143.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2671 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.676702508960574, "grad_norm": 0.09636463428576127, "learning_rate": 9.254195757752547e-09, "loss": 0.0, "num_tokens": 216869537.0, "reward": 0.8359375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2672 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.679569892473118, "grad_norm": 0.0389602525214859, "learning_rate": 9.102097430568889e-09, "loss": 0.0, "num_tokens": 216937156.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2673 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.682437275985663, "grad_norm": 0.10580992641584908, "learning_rate": 8.951253681712234e-09, "loss": 0.0, "num_tokens": 217018393.0, "reward": 0.8671875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2674 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022598620254011e-09, "advantages/std": 0.6185519695281982, "advantages/var": 0.3826065390072131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.6853046594982075, "grad_norm": 0.13720080016998606, "learning_rate": 8.801664702166367e-09, "loss": 0.0, "num_tokens": 217102988.0, "reward": 0.921875, "reward_std": 0.1659901738166809, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2675 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.688172043010753, "grad_norm": 0.07001202663451297, "learning_rate": 8.653330681326232e-09, "loss": 0.0, "num_tokens": 217180704.0, "reward": 0.859375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2676 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6720844723812904e-09, "advantages/std": 0.5228067636489868, "advantages/var": 0.27332691211712756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.6910394265232975, "grad_norm": 0.17707506611321436, "learning_rate": 8.506251806997932e-09, "loss": 0.0, "num_tokens": 217257453.0, "reward": 0.8046875, "reward_std": 0.15490421652793884, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2677 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199591840825068e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.693906810035842, "grad_norm": 0.09510770528265325, "learning_rate": 8.3604282653984e-09, "loss": 0.0, "num_tokens": 217334152.0, "reward": 0.90625, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2678 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814751550759118e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.6967741935483875, "grad_norm": 0.1117515767481919, "learning_rate": 8.215860241155058e-09, "loss": 0.0, "num_tokens": 217427153.0, "reward": 0.8046875, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2679 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907256955369e-09, "advantages/std": 0.5227886438369751, "advantages/var": 0.2733079661249036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 7.699641577060932, "grad_norm": 0.09211896656131062, "learning_rate": 8.072547917305939e-09, "loss": -0.0, "num_tokens": 217508592.0, "reward": 0.78125, "reward_std": 0.1354655921459198, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2680 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 7.702508960573477, "grad_norm": 0.07907253225985557, "learning_rate": 7.930491475299227e-09, "loss": 0.0, "num_tokens": 217598379.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2681 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.8996420859237135e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.705376344086021, "grad_norm": 0.09680865606183092, "learning_rate": 7.789691094992834e-09, "loss": 0.0, "num_tokens": 217665828.0, "reward": 0.9609375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2682 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.708243727598567, "grad_norm": 0.07900924007013105, "learning_rate": 7.650146954654491e-09, "loss": 0.0, "num_tokens": 217735925.0, "reward": 0.78125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2683 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.4082993428404723e-08, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.711111111111111, "grad_norm": 0.08300974944848195, "learning_rate": 7.511859230961315e-09, "loss": 0.0, "num_tokens": 217814333.0, "reward": 0.890625, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2684 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954489382432772e-08, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.713978494623656, "grad_norm": 0.08399229608988694, "learning_rate": 7.37482809900003e-09, "loss": 0.0, "num_tokens": 217885789.0, "reward": 0.9375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2685 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.7168458781362, "grad_norm": 0.06226440592484084, "learning_rate": 7.239053732265743e-09, "loss": 0.0, "num_tokens": 217958104.0, "reward": 0.859375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2686 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633180108710322e-09, "advantages/std": 0.3306560516357422, "advantages/var": 0.1093334244833386, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.719713261648746, "grad_norm": 0.1327203335786122, "learning_rate": 7.104536302662833e-09, "loss": 0.0, "num_tokens": 218031770.0, "reward": 0.8203125, "reward_std": 0.0657544732093811, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2687 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049325332145131e-09, "advantages/std": 0.40495678782463074, "advantages/var": 0.163990000005243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.72258064516129, "grad_norm": 0.08045891667395001, "learning_rate": 6.971275980504176e-09, "loss": 0.0, "num_tokens": 218115744.0, "reward": 0.75, "reward_std": 0.08785156160593033, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.434714138507843, "step": 2688 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 8.016351208262037e-09, "advantages/std": 0.5228003859519958, "advantages/var": 0.2733202435515558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.734375, "epoch": 7.725448028673835, "grad_norm": 0.20407715647000374, "learning_rate": 6.8392729345111425e-09, "loss": 0.0, "num_tokens": 218200423.0, "reward": 0.8046875, "reward_std": 0.14913025498390198, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2689 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629072505384383e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.7283154121863795, "grad_norm": 0.11691903434726374, "learning_rate": 6.7085273318128185e-09, "loss": 0.0, "num_tokens": 218278595.0, "reward": 0.9296875, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2690 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.731182795698925, "grad_norm": 0.059861063471256346, "learning_rate": 6.5790393379467905e-09, "loss": 0.0, "num_tokens": 218361352.0, "reward": 0.9375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2691 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9835626514248234e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.7340501792114695, "grad_norm": 0.09439922768306205, "learning_rate": 6.450809116858136e-09, "loss": 0.0, "num_tokens": 218444512.0, "reward": 0.9609375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2692 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125678014490734e-09, "advantages/std": 0.5227980017662048, "advantages/var": 0.2733177506507367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.736917562724014, "grad_norm": 0.12924627370163774, "learning_rate": 6.32383683089932e-09, "loss": -0.0, "num_tokens": 218528346.0, "reward": 0.90625, "reward_std": 0.14571532607078552, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2693 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814379860189666e-09, "advantages/std": 0.5227925777435303, "advantages/var": 0.27331207934372515, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.7397849462365595, "grad_norm": 0.16310779641981915, "learning_rate": 6.1981226408303056e-09, "loss": 0.0, "num_tokens": 218613416.0, "reward": 0.8046875, "reward_std": 0.13782459497451782, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2694 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.742652329749104, "grad_norm": 0.0, "learning_rate": 6.073666705818104e-09, "loss": 0.0, "num_tokens": 218688676.0, "reward": 0.875, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2695 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.971027590413914e-09, "advantages/std": 0.4675966203212738, "advantages/var": 0.2186465993358775, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.745519713261649, "grad_norm": 0.09387237257468709, "learning_rate": 5.9504691834368905e-09, "loss": 0.0, "num_tokens": 218784615.0, "reward": 0.9140625, "reward_std": 0.10994865000247955, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2696 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.4082993428404723e-08, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.748387096774193, "grad_norm": 0.07097908960534689, "learning_rate": 5.828530229667228e-09, "loss": 0.0, "num_tokens": 218849472.0, "reward": 0.953125, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.953125, "rewards/drgrpo_math_reward/std": 0.21220162510871887, "step": 2697 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499234216592094e-08, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.751254480286739, "grad_norm": 0.1148408687090542, "learning_rate": 5.7078499988961745e-09, "loss": 0.0, "num_tokens": 218925077.0, "reward": 0.8359375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2698 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.754121863799283, "grad_norm": 0.14558999467538067, "learning_rate": 5.588428643917509e-09, "loss": 0.0, "num_tokens": 219011215.0, "reward": 0.9140625, "reward_std": 0.12415502220392227, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2699 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.756989247311828, "grad_norm": 0.05451999267912209, "learning_rate": 5.4702663159308385e-09, "loss": 0.0, "num_tokens": 219088852.0, "reward": 0.8671875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2700 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167162292944866e-09, "advantages/std": 0.3306412398815155, "advantages/var": 0.10932362951038588, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.759856630824372, "grad_norm": 0.0705969087857691, "learning_rate": 5.353363164541824e-09, "loss": -0.0, "num_tokens": 219173993.0, "reward": 0.7265625, "reward_std": 0.05550473928451538, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2701 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.762724014336918, "grad_norm": 0.11894700660589524, "learning_rate": 5.2377193377617365e-09, "loss": 0.0, "num_tokens": 219249412.0, "reward": 0.8046875, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2702 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.765591397849462, "grad_norm": 0.11047274624979665, "learning_rate": 5.123334982007566e-09, "loss": 0.0, "num_tokens": 219323901.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2703 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4497441148988883e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.768458781362007, "grad_norm": 0.11190909380506751, "learning_rate": 5.0102102421016865e-09, "loss": 0.0, "num_tokens": 219407707.0, "reward": 0.796875, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2704 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.7713261648745515, "grad_norm": 0.040562304752599074, "learning_rate": 4.8983452612715306e-09, "loss": -0.0, "num_tokens": 219471542.0, "reward": 0.8203125, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2705 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125773067096241e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.774193548387097, "grad_norm": 0.14838139256253263, "learning_rate": 4.78774018114958e-09, "loss": 0.0, "num_tokens": 219562303.0, "reward": 0.7265625, "reward_std": 0.13888053596019745, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.447474867105484, "step": 2706 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917811987622486e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.7770609318996415, "grad_norm": 0.13831066794407268, "learning_rate": 4.678395141773373e-09, "loss": 0.0, "num_tokens": 219653895.0, "reward": 0.6953125, "reward_std": 0.09522313624620438, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 2707 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.975244195968941e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.779928315412186, "grad_norm": 0.16448989630108082, "learning_rate": 4.570310281584832e-09, "loss": 0.0, "num_tokens": 219723862.0, "reward": 0.8125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2708 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.7827956989247316, "grad_norm": 0.0648672440595542, "learning_rate": 4.463485737430605e-09, "loss": 0.0, "num_tokens": 219806311.0, "reward": 0.890625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2709 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299897960206267e-09, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.785663082437276, "grad_norm": 0.10196929770514401, "learning_rate": 4.35792164456128e-09, "loss": 0.0, "num_tokens": 219883002.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2710 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.788530465949821, "grad_norm": 0.06411780572392889, "learning_rate": 4.253618136631943e-09, "loss": -0.0, "num_tokens": 219954027.0, "reward": 0.8671875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2711 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8167571052905777e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.791397849462365, "grad_norm": 0.0983928491598763, "learning_rate": 4.1505753457016235e-09, "loss": 0.0, "num_tokens": 220026131.0, "reward": 0.8203125, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2712 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.52825480422812e-09, "advantages/std": 0.6185514330863953, "advantages/var": 0.3826058753732333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.794265232974911, "grad_norm": 0.1578133285862227, "learning_rate": 4.048793402232853e-09, "loss": 0.0, "num_tokens": 220111872.0, "reward": 0.8671875, "reward_std": 0.1649293452501297, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2713 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494081377983093, "advantages/var": 0.1639770626646717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.797132616487455, "grad_norm": 0.06988750574363187, "learning_rate": 3.948272435092214e-09, "loss": 0.0, "num_tokens": 220184531.0, "reward": 0.9375, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2714 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.8, "grad_norm": 0.030883191530758667, "learning_rate": 3.849012571549348e-09, "loss": 0.0, "num_tokens": 220263468.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2715 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.802867383512545, "grad_norm": 0.08513554698850792, "learning_rate": 3.751013937277614e-09, "loss": 0.0, "num_tokens": 220333592.0, "reward": 0.90625, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2716 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.80573476702509, "grad_norm": 0.12487730211503031, "learning_rate": 3.654276656353206e-09, "loss": 0.0, "num_tokens": 220411113.0, "reward": 0.828125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2717 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.808602150537634, "grad_norm": 0.06614608905441266, "learning_rate": 3.5588008512555944e-09, "loss": 0.0, "num_tokens": 220484064.0, "reward": 0.984375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2718 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.811469534050179, "grad_norm": 0.08775363283175698, "learning_rate": 3.4645866428667514e-09, "loss": 0.0, "num_tokens": 220558740.0, "reward": 0.8046875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3979988098144531, "step": 2719 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.8143369175627235, "grad_norm": 0.0, "learning_rate": 3.371634150471481e-09, "loss": 0.0, "num_tokens": 220636615.0, "reward": 0.8125, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2720 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.149940263022739e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.817204301075269, "grad_norm": 0.16322981571413456, "learning_rate": 3.27994349175742e-09, "loss": -0.0, "num_tokens": 220702588.0, "reward": 0.9453125, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.9453125, "rewards/drgrpo_math_reward/std": 0.22826264798641205, "step": 2721 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958626076587386e-10, "advantages/std": 0.4675959050655365, "advantages/var": 0.21864593043405822, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.8200716845878135, "grad_norm": 0.12377264896800805, "learning_rate": 3.189514782814151e-09, "loss": 0.0, "num_tokens": 220775512.0, "reward": 0.921875, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2722 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252482966806137e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.822939068100358, "grad_norm": 0.12787739498772854, "learning_rate": 3.1003481381337572e-09, "loss": 0.0, "num_tokens": 220861917.0, "reward": 0.765625, "reward_std": 0.16097760200500488, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2723 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.198954094593892e-09, "advantages/std": 0.40496888756752014, "advantages/var": 0.16399979989767477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.825806451612904, "grad_norm": 0.08716448625132536, "learning_rate": 3.0124436706102653e-09, "loss": 0.0, "num_tokens": 220944635.0, "reward": 0.859375, "reward_std": 0.09810129553079605, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2724 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.828673835125448, "grad_norm": 0.167880645885384, "learning_rate": 2.925801491539981e-09, "loss": 0.0, "num_tokens": 221009199.0, "reward": 0.984375, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.984375, "rewards/drgrpo_math_reward/std": 0.12450689822435379, "step": 2725 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.831541218637993, "grad_norm": 0.1561044105138881, "learning_rate": 2.840421710620489e-09, "loss": 0.0, "num_tokens": 221083621.0, "reward": 0.8515625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2726 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899847745916856e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.834408602150537, "grad_norm": 0.10871773923395783, "learning_rate": 2.7563044359514286e-09, "loss": 0.0, "num_tokens": 221164548.0, "reward": 0.9140625, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2727 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499235062879438e-08, "advantages/std": 0.40494978427886963, "advantages/var": 0.16398432778750305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.837275985663083, "grad_norm": 0.10976454459404987, "learning_rate": 2.6734497740340533e-09, "loss": 0.0, "num_tokens": 221240488.0, "reward": 0.8984375, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2728 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 1.1499147049662961e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.840143369175627, "grad_norm": 0.08882144801910914, "learning_rate": 2.591857829770672e-09, "loss": 0.0, "num_tokens": 221325464.0, "reward": 0.765625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42527204751968384, "step": 2729 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.2250872920904075e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.843010752688172, "grad_norm": 0.05628016315565577, "learning_rate": 2.5115287064650934e-09, "loss": 0.0, "num_tokens": 221400026.0, "reward": 0.796875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2730 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.845878136200717, "grad_norm": 0.11938439253889663, "learning_rate": 2.432462505822297e-09, "loss": 0.0, "num_tokens": 221481678.0, "reward": 0.8359375, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2731 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.848745519713262, "grad_norm": 0.04007472771262617, "learning_rate": 2.3546593279482053e-09, "loss": 0.0, "num_tokens": 221566319.0, "reward": 0.6796875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 2732 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344599133202926e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.851612903225806, "grad_norm": 0.10316750023043703, "learning_rate": 2.2781192713494656e-09, "loss": 0.0, "num_tokens": 221658000.0, "reward": 0.7109375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45510825514793396, "step": 2733 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966992261291921e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.854480286738351, "grad_norm": 0.09879265918172515, "learning_rate": 2.2028424329337827e-09, "loss": -0.0, "num_tokens": 221737115.0, "reward": 0.90625, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2734 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9875550720364307e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.857347670250896, "grad_norm": 0.23714559186754167, "learning_rate": 2.1288289080092504e-09, "loss": 0.0, "num_tokens": 221822242.0, "reward": 0.796875, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40390563011169434, "step": 2735 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.5995914574054795e-09, "advantages/std": 0.4049588143825531, "advantages/var": 0.1639916413461231, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 7.860215053763441, "grad_norm": 0.055693316196279354, "learning_rate": 2.056078790284688e-09, "loss": 0.0, "num_tokens": 221911649.0, "reward": 0.859375, "reward_std": 0.0867956355214119, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2736 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.219668799133311e-08, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.8630824372759855, "grad_norm": 0.15205751802922915, "learning_rate": 1.9845921718690816e-09, "loss": 0.0, "num_tokens": 221992707.0, "reward": 0.84375, "reward_std": 0.16675157845020294, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2737 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.86594982078853, "grad_norm": 0.10385354678090679, "learning_rate": 1.914369143272032e-09, "loss": 0.0, "num_tokens": 222077741.0, "reward": 0.7578125, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4300905168056488, "step": 2738 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 9.757858184220564e-09, "advantages/std": 0.5726600289344788, "advantages/var": 0.32793950873923805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.868817204301076, "grad_norm": 0.184846041114866, "learning_rate": 1.8454097934027524e-09, "loss": 0.0, "num_tokens": 222149218.0, "reward": 0.890625, "reward_std": 0.13258251547813416, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2739 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.330655038356781, "advantages/var": 0.10933275439072432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.87168458781362, "grad_norm": 0.07811272576960604, "learning_rate": 1.7777142095711794e-09, "loss": -0.0, "num_tokens": 222238855.0, "reward": 0.8125, "reward_std": 0.0646936446428299, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2740 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983539800525091e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.874551971326165, "grad_norm": 0.11646064019977592, "learning_rate": 1.7112824774866419e-09, "loss": 0.0, "num_tokens": 222318739.0, "reward": 0.65625, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47682511806488037, "step": 2741 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.0, "advantages/var": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.877419354838709, "grad_norm": 0.0, "learning_rate": 1.6461146812586368e-09, "loss": 0.0, "num_tokens": 222387888.0, "reward": 0.9375, "reward_std": 0.0, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2742 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.880286738351255, "grad_norm": 0.07704070029133948, "learning_rate": 1.582210903396275e-09, "loss": -0.0, "num_tokens": 222458262.0, "reward": 0.9296875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2743 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199113184367294e-09, "advantages/std": 0.40496188402175903, "advantages/var": 0.16399412751045261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 7.883154121863799, "grad_norm": 0.20750762751742974, "learning_rate": 1.5195712248081693e-09, "loss": 0.0, "num_tokens": 222538911.0, "reward": 0.7890625, "reward_std": 0.09021057933568954, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2744 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.886021505376344, "grad_norm": 0.028746765543212663, "learning_rate": 1.4581957248026577e-09, "loss": 0.0, "num_tokens": 222614453.0, "reward": 0.90625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2745 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 7.966726021133282e-09, "advantages/std": 0.23380307853221893, "advantages/var": 0.054663879531142934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.888888888888889, "grad_norm": 0.08040992155965311, "learning_rate": 1.3980844810875803e-09, "loss": 0.0, "num_tokens": 222687585.0, "reward": 0.859375, "reward_std": 0.0289318785071373, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2746 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.453752885066694e-09, "advantages/std": 0.5227740406990051, "advantages/var": 0.27329269762876507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.891756272401434, "grad_norm": 0.10904768791813681, "learning_rate": 1.3392375697696134e-09, "loss": 0.0, "num_tokens": 222768456.0, "reward": 0.7734375, "reward_std": 0.12073516845703125, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4202519655227661, "step": 2747 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.894623655913978, "grad_norm": 0.056924319592008936, "learning_rate": 1.2816550653551584e-09, "loss": -0.0, "num_tokens": 222847091.0, "reward": 0.828125, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3787541687488556, "step": 2748 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814751550759118e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.897491039426523, "grad_norm": 0.19184599630160612, "learning_rate": 1.2253370407495634e-09, "loss": 0.0, "num_tokens": 222921748.0, "reward": 0.9296875, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2749 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966992261291921e-09, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.900358422939068, "grad_norm": 0.08997440020834355, "learning_rate": 1.1702835672572353e-09, "loss": -0.0, "num_tokens": 223008554.0, "reward": 0.84375, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2750 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.041577316723057e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.903225806451613, "grad_norm": 0.11961462196368258, "learning_rate": 1.1164947145815285e-09, "loss": 0.0, "num_tokens": 223083316.0, "reward": 0.8984375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2751 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.9060931899641576, "grad_norm": 0.07615611470258046, "learning_rate": 1.0639705508245222e-09, "loss": 0.0, "num_tokens": 223162187.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2752 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907216333870301e-10, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "epoch": 7.908960573476703, "grad_norm": 0.11669161858992876, "learning_rate": 1.0127111424872436e-09, "loss": -0.0, "num_tokens": 223241428.0, "reward": 0.8203125, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.3854354918003082, "step": 2753 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.911827956989248, "grad_norm": 0.05064682095632566, "learning_rate": 9.62716554469445e-10, "loss": 0.0, "num_tokens": 223326818.0, "reward": 0.9765625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9765625, "rewards/drgrpo_math_reward/std": 0.15188287198543549, "step": 2754 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.914695340501792, "grad_norm": 0.05404498579841204, "learning_rate": 9.139868500693815e-10, "loss": 0.0, "num_tokens": 223399407.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2755 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.975176026781512e-09, "advantages/std": 0.4675958752632141, "advantages/var": 0.2186459025631713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.917562724014337, "grad_norm": 0.08613914332482507, "learning_rate": 8.665220909838123e-10, "loss": -0.0, "num_tokens": 223479721.0, "reward": 0.84375, "reward_std": 0.10888782143592834, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2756 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299949248638952e-09, "advantages/std": 0.40493178367614746, "advantages/var": 0.16396974943114628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.920430107526881, "grad_norm": 0.13985749059222663, "learning_rate": 8.203223373078883e-10, "loss": -0.0, "num_tokens": 223551462.0, "reward": 0.9296875, "reward_std": 0.06629125773906708, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.2566775679588318, "step": 2757 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 9.958180313570735e-09, "advantages/std": 0.23380841314792633, "advantages/var": 0.05466637405875141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "epoch": 7.923297491039427, "grad_norm": 0.028255021993964898, "learning_rate": 7.753876475353749e-10, "loss": -0.0, "num_tokens": 223625050.0, "reward": 0.9609375, "reward_std": 0.03234682232141495, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2758 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149905988405531e-09, "advantages/std": 0.4049559533596039, "advantages/var": 0.16398932416138567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.926164874551971, "grad_norm": 0.09122449924121048, "learning_rate": 7.317180785582078e-10, "loss": 0.0, "num_tokens": 223711809.0, "reward": 0.8671875, "reward_std": 0.08679073303937912, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2759 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599761052090956e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 7.929032258064516, "grad_norm": 0.11140787709211054, "learning_rate": 6.893136856664928e-10, "loss": 0.0, "num_tokens": 223790438.0, "reward": 0.7890625, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4095771610736847, "step": 2760 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.931899641577061, "grad_norm": 0.09015450308190896, "learning_rate": 6.481745225485057e-10, "loss": 0.0, "num_tokens": 223860406.0, "reward": 0.96875, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2761 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.633702096822814e-09, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.934767025089606, "grad_norm": 0.1530075651630246, "learning_rate": 6.083006412906932e-10, "loss": 0.0, "num_tokens": 223932568.0, "reward": 0.921875, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.2694226801395416, "step": 2762 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 9.858537014877616e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.93763440860215, "grad_norm": 0.07419653303663103, "learning_rate": 5.696920923774496e-10, "loss": 0.0, "num_tokens": 224001398.0, "reward": 0.96875, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.96875, "rewards/drgrpo_math_reward/std": 0.1746762990951538, "step": 2763 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199094228701277e-09, "advantages/std": 0.4049627184867859, "advantages/var": 0.1639948033642078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.940501792114695, "grad_norm": 0.07543987633431061, "learning_rate": 5.323489246911172e-10, "loss": -0.0, "num_tokens": 224078782.0, "reward": 0.875, "reward_std": 0.09127141535282135, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3320184051990509, "step": 2764 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.449892780067669e-09, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.94336917562724, "grad_norm": 0.07923271569372832, "learning_rate": 4.962711855120982e-10, "loss": 0.0, "num_tokens": 224158577.0, "reward": 0.8671875, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3407054841518402, "step": 2765 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.633244590331673e-09, "advantages/std": 0.33065226674079895, "advantages/var": 0.10933092150082846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.946236559139785, "grad_norm": 0.07055910941482718, "learning_rate": 4.614589205184094e-10, "loss": -0.0, "num_tokens": 224241620.0, "reward": 0.859375, "reward_std": 0.06233953312039375, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2766 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.2649061754629257e-08, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.94910394265233, "grad_norm": 0.07495124993441504, "learning_rate": 4.279121737859048e-10, "loss": 0.0, "num_tokens": 224322394.0, "reward": 0.890625, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31333550810813904, "step": 2767 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 1.126665165824288e-08, "advantages/std": 0.3306474983692169, "advantages/var": 0.1093277681778213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.951971326164875, "grad_norm": 0.060232234185240006, "learning_rate": 3.9563098778827576e-10, "loss": -0.0, "num_tokens": 224407186.0, "reward": 0.84375, "reward_std": 0.0578637570142746, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2768 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2525469477123842e-09, "advantages/std": 0.5726727843284607, "advantages/var": 0.32795411791051166, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "epoch": 7.95483870967742, "grad_norm": 0.14730117650532376, "learning_rate": 3.646154033968285e-10, "loss": 0.0, "num_tokens": 224481005.0, "reward": 0.90625, "reward_std": 0.1462520956993103, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29262590408325195, "step": 2769 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199592517885038e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.957706093189964, "grad_norm": 0.08582205000676567, "learning_rate": 3.3486545988048454e-10, "loss": 0.0, "num_tokens": 224565442.0, "reward": 0.84375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2770 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.2674839170101503e-08, "advantages/std": 0.33065125346183777, "advantages/var": 0.10933025141588448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "epoch": 7.960573476702509, "grad_norm": 0.050238328452193, "learning_rate": 3.063811949056694e-10, "loss": -0.0, "num_tokens": 224641292.0, "reward": 0.8984375, "reward_std": 0.061278700828552246, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.3032590448856354, "step": 2771 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.691787729288785e-09, "advantages/std": 0.5726898312568665, "advantages/var": 0.3279736428250182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.963440860215054, "grad_norm": 0.24453593361327636, "learning_rate": 2.791626445364237e-10, "loss": -0.0, "num_tokens": 224711573.0, "reward": 0.78125, "reward_std": 0.16675157845020294, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41502299904823303, "step": 2772 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344599133202926e-09, "advantages/std": 0.5227646827697754, "advantages/var": 0.2732829135513839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.966308243727599, "grad_norm": 0.15305083215579046, "learning_rate": 2.5320984323418113e-10, "loss": 0.0, "num_tokens": 224785946.0, "reward": 0.9609375, "reward_std": 0.11048543453216553, "rewards/drgrpo_math_reward/mean": 0.9609375, "rewards/drgrpo_math_reward/std": 0.194504976272583, "step": 2773 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.969175627240143, "grad_norm": 0.12863254373966837, "learning_rate": 2.2852282385787957e-10, "loss": 0.0, "num_tokens": 224864554.0, "reward": 0.8515625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.356930136680603, "step": 2774 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.764007346427472e-09, "advantages/std": 0.6185711622238159, "advantages/var": 0.3826302827349224, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 7.972043010752688, "grad_norm": 0.13836384217215011, "learning_rate": 2.051016176637388e-10, "loss": -0.0, "num_tokens": 224957309.0, "reward": 0.6953125, "reward_std": 0.18884867429733276, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 2775 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917668534980524e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 7.974910394265233, "grad_norm": 0.10442698926745332, "learning_rate": 1.8294625430559373e-10, "loss": 0.0, "num_tokens": 225048398.0, "reward": 0.6953125, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.46208351850509644, "step": 2776 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 9.199522104181912e-09, "advantages/std": 0.40494388341903687, "advantages/var": 0.16397954871849052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "epoch": 7.977777777777778, "grad_norm": 0.10296584659946971, "learning_rate": 1.6205676183411732e-10, "loss": 0.0, "num_tokens": 225139410.0, "reward": 0.8359375, "reward_std": 0.0765409991145134, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.371787428855896, "step": 2777 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33062541484832764, "advantages/var": 0.10931316494362875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "epoch": 7.980645161290322, "grad_norm": 0.05437560348032493, "learning_rate": 1.4243316669781957e-10, "loss": 0.0, "num_tokens": 225224778.0, "reward": 0.8125, "reward_std": 0.04419417306780815, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39184603095054626, "step": 2778 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016207721177622e-09, "advantages/std": 0.5228097438812256, "advantages/var": 0.2733300282971527, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.796875, "epoch": 7.983512544802867, "grad_norm": 0.1326112289917123, "learning_rate": 1.240754937420485e-10, "loss": -0.0, "num_tokens": 225312787.0, "reward": 0.6796875, "reward_std": 0.1593799889087677, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4684300124645233, "step": 2779 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2998981294712596e-09, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 7.986379928315412, "grad_norm": 0.0786852035018988, "learning_rate": 1.0698376620954518e-10, "loss": 0.0, "num_tokens": 225392859.0, "reward": 0.9375, "reward_std": 0.07312605530023575, "rewards/drgrpo_math_reward/mean": 0.9375, "rewards/drgrpo_math_reward/std": 0.24301259219646454, "step": 2780 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4049447178840637, "advantages/var": 0.16398022454220396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "epoch": 7.989247311827957, "grad_norm": 0.06871610885006592, "learning_rate": 9.11580057402217e-11, "loss": -0.0, "num_tokens": 225476694.0, "reward": 0.84375, "reward_std": 0.0776018276810646, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3645188808441162, "step": 2781 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.63344972278721e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "epoch": 7.992114695340502, "grad_norm": 0.06211571730686199, "learning_rate": 7.659823237105013e-11, "loss": -0.0, "num_tokens": 225557155.0, "reward": 0.859375, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3490002751350403, "step": 2782 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.983628835318941e-09, "advantages/std": 0.23378747701644897, "advantages/var": 0.05465658440971666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.994982078853047, "grad_norm": 0.04752566512581673, "learning_rate": 6.330446453617356e-11, "loss": 0.0, "num_tokens": 225635250.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/drgrpo_math_reward/mean": 0.9921875, "rewards/drgrpo_math_reward/std": 0.0883883461356163, "step": 2783 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.450271315871733e-09, "advantages/std": 0.3306364417076111, "advantages/var": 0.1093204565850705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "epoch": 7.997849462365592, "grad_norm": 0.10108201830407647, "learning_rate": 5.127671906690612e-11, "loss": 0.0, "num_tokens": 225718466.0, "reward": 0.9140625, "reward_std": 0.05102896690368652, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.2813730239868164, "step": 2784 }, { "epoch": 7.997849462365592, "step": 2784, "total_flos": 0.0, "train_loss": 1.1265189547477097e-09, "train_runtime": 46427.9339, "train_samples_per_second": 0.961, "train_steps_per_second": 0.06 } ], "logging_steps": 1, "max_steps": 2792, "num_input_tokens_seen": 225718466, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }