{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.375, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.015625, "frac_reward_zero_std": 0.0, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 25104.0, "reward": 0.38830292224884033, "reward_std": 0.0850929468870163, "rewards/grpo_reward_func/mean": 0.38830292224884033, "rewards/grpo_reward_func/std": 0.08739857375621796, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.03125, "frac_reward_zero_std": 0.0, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 8.333333333333334e-09, "loss": -0.0, "num_tokens": 50064.0, "reward": 0.38852280378341675, "reward_std": 0.14650246500968933, "rewards/grpo_reward_func/mean": 0.38852280378341675, "rewards/grpo_reward_func/std": 0.1692417562007904, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.046875, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.0006874206592328846, "learning_rate": 1.6666666666666667e-08, "loss": 0.0, "num_tokens": 75464.0, "reward": 0.2590749263763428, "reward_std": 0.11121661216020584, "rewards/grpo_reward_func/mean": 0.2590749263763428, "rewards/grpo_reward_func/std": 0.1687185913324356, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0625, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.0003259268924011849, "learning_rate": 2.5e-08, "loss": 0.0, "num_tokens": 100632.0, "reward": 0.3599267899990082, "reward_std": 0.1382569521665573, "rewards/grpo_reward_func/mean": 0.3599267899990082, "rewards/grpo_reward_func/std": 0.15764164924621582, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.078125, "frac_reward_zero_std": 0.0, "grad_norm": 9.4375, "kl": 0.00010777699208119884, "learning_rate": 3.3333333333333334e-08, "loss": 0.0, "num_tokens": 125584.0, "reward": 0.4119647145271301, "reward_std": 0.10570582747459412, "rewards/grpo_reward_func/mean": 0.4119647145271301, "rewards/grpo_reward_func/std": 0.14051103591918945, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.09375, "frac_reward_zero_std": 0.0, "grad_norm": 10.0, "kl": 0.0003376482127350755, "learning_rate": 4.166666666666666e-08, "loss": 0.0, "num_tokens": 150800.0, "reward": 0.2505754232406616, "reward_std": 0.1554991602897644, "rewards/grpo_reward_func/mean": 0.2505754232406616, "rewards/grpo_reward_func/std": 0.15568533539772034, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.109375, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "kl": 0.00025600909430067986, "learning_rate": 5e-08, "loss": 0.0, "num_tokens": 176576.0, "reward": 0.06775141507387161, "reward_std": 0.14545553922653198, "rewards/grpo_reward_func/mean": 0.06775141507387161, "rewards/grpo_reward_func/std": 0.16305957734584808, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.125, "frac_reward_zero_std": 0.0, "grad_norm": 10.0625, "kl": 0.00029405950772343203, "learning_rate": 5.833333333333333e-08, "loss": 0.0, "num_tokens": 202072.0, "reward": 0.11841653287410736, "reward_std": 0.0738542377948761, "rewards/grpo_reward_func/mean": 0.11841653287410736, "rewards/grpo_reward_func/std": 0.07645151019096375, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.140625, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "kl": 0.0002696753217605874, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "num_tokens": 227912.0, "reward": 0.10485261678695679, "reward_std": 0.09079232811927795, "rewards/grpo_reward_func/mean": 0.10485261678695679, "rewards/grpo_reward_func/std": 0.08799877762794495, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.15625, "frac_reward_zero_std": 0.0, "grad_norm": 19.125, "kl": 0.0005563851591432467, "learning_rate": 7.5e-08, "loss": 0.0, "num_tokens": 252904.0, "reward": 0.3231259882450104, "reward_std": 0.16560493409633636, "rewards/grpo_reward_func/mean": 0.3231259882450104, "rewards/grpo_reward_func/std": 0.1688951849937439, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.171875, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "kl": 7.444247421517503e-05, "learning_rate": 8.333333333333333e-08, "loss": 0.0, "num_tokens": 278576.0, "reward": 0.214687317609787, "reward_std": 0.0769403874874115, "rewards/grpo_reward_func/mean": 0.214687317609787, "rewards/grpo_reward_func/std": 0.2403011918067932, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1875, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.00032506883871974424, "learning_rate": 9.166666666666665e-08, "loss": 0.0, "num_tokens": 303920.0, "reward": 0.20663060247898102, "reward_std": 0.10999321937561035, "rewards/grpo_reward_func/mean": 0.20663060247898102, "rewards/grpo_reward_func/std": 0.15555834770202637, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.203125, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.00023091987532097846, "learning_rate": 1e-07, "loss": 0.0, "num_tokens": 329024.0, "reward": 0.36360427737236023, "reward_std": 0.15357878804206848, "rewards/grpo_reward_func/mean": 0.36360427737236023, "rewards/grpo_reward_func/std": 0.22707244753837585, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.21875, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.00046382473010453396, "learning_rate": 1.0833333333333334e-07, "loss": 0.0, "num_tokens": 353760.0, "reward": 0.3822150230407715, "reward_std": 0.11560969054698944, "rewards/grpo_reward_func/mean": 0.3822150230407715, "rewards/grpo_reward_func/std": 0.11733639240264893, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.234375, "frac_reward_zero_std": 0.0, "grad_norm": 11.0, "kl": 0.00023804418742656708, "learning_rate": 1.1666666666666667e-07, "loss": 0.0, "num_tokens": 379000.0, "reward": 0.17802061140537262, "reward_std": 0.08576580137014389, "rewards/grpo_reward_func/mean": 0.17802061140537262, "rewards/grpo_reward_func/std": 0.1571587771177292, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "kl": 9.951802576324553e-05, "learning_rate": 1.25e-07, "loss": 0.0, "num_tokens": 404232.0, "reward": 0.1350761502981186, "reward_std": 0.07875347137451172, "rewards/grpo_reward_func/mean": 0.1350761502981186, "rewards/grpo_reward_func/std": 0.14856529235839844, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.265625, "frac_reward_zero_std": 0.0, "grad_norm": 15.9375, "kl": 0.0003588832914829254, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "num_tokens": 429392.0, "reward": 0.25521713495254517, "reward_std": 0.17234337329864502, "rewards/grpo_reward_func/mean": 0.25521713495254517, "rewards/grpo_reward_func/std": 0.24440135061740875, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.28125, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "kl": 0.000218976605538046, "learning_rate": 1.4166666666666665e-07, "loss": 0.0, "num_tokens": 454352.0, "reward": 0.3575406074523926, "reward_std": 0.1384538859128952, "rewards/grpo_reward_func/mean": 0.3575406074523926, "rewards/grpo_reward_func/std": 0.13475467264652252, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.296875, "frac_reward_zero_std": 0.0, "grad_norm": 10.4375, "kl": 0.00021025817841291428, "learning_rate": 1.5e-07, "loss": 0.0, "num_tokens": 480208.0, "reward": 0.09945888817310333, "reward_std": 0.09086775779724121, "rewards/grpo_reward_func/mean": 0.09945888817310333, "rewards/grpo_reward_func/std": 0.10034166276454926, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3125, "frac_reward_zero_std": 0.0, "grad_norm": 18.625, "kl": 0.00023360302293440327, "learning_rate": 1.583333333333333e-07, "loss": 0.0, "num_tokens": 506432.0, "reward": 0.08866722881793976, "reward_std": 0.11887718737125397, "rewards/grpo_reward_func/mean": 0.08866722881793976, "rewards/grpo_reward_func/std": 0.12845923006534576, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.328125, "frac_reward_zero_std": 0.0, "grad_norm": 17.625, "kl": 0.0002354470343561843, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "num_tokens": 532456.0, "reward": 0.06954602897167206, "reward_std": 0.11671130359172821, "rewards/grpo_reward_func/mean": 0.06954602897167206, "rewards/grpo_reward_func/std": 0.11377867311239243, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.34375, "frac_reward_zero_std": 0.0, "grad_norm": 7.0, "kl": 0.0003678910434246063, "learning_rate": 1.75e-07, "loss": 0.0, "num_tokens": 557416.0, "reward": 0.31994929909706116, "reward_std": 0.11059385538101196, "rewards/grpo_reward_func/mean": 0.31994929909706116, "rewards/grpo_reward_func/std": 0.1296655386686325, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.359375, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "kl": 0.00020401241636136547, "learning_rate": 1.833333333333333e-07, "loss": 0.0, "num_tokens": 582720.0, "reward": 0.3206818401813507, "reward_std": 0.10244449228048325, "rewards/grpo_reward_func/mean": 0.3206818401813507, "rewards/grpo_reward_func/std": 0.17895404994487762, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.375, "frac_reward_zero_std": 0.0, "grad_norm": 14.8125, "kl": 0.00020465596026042476, "learning_rate": 1.9166666666666668e-07, "loss": 0.0, "num_tokens": 608040.0, "reward": 0.43054676055908203, "reward_std": 0.18113180994987488, "rewards/grpo_reward_func/mean": 0.43054676055908203, "rewards/grpo_reward_func/std": 0.2842019498348236, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.390625, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.00022962906950851902, "learning_rate": 2e-07, "loss": 0.0, "num_tokens": 633024.0, "reward": 0.28427720069885254, "reward_std": 0.099380724132061, "rewards/grpo_reward_func/mean": 0.28427720069885254, "rewards/grpo_reward_func/std": 0.11085890978574753, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.40625, "frac_reward_zero_std": 0.0, "grad_norm": 14.4375, "kl": 0.00015032757073640823, "learning_rate": 2.0833333333333333e-07, "loss": 0.0, "num_tokens": 657928.0, "reward": 0.3395322561264038, "reward_std": 0.09541542828083038, "rewards/grpo_reward_func/mean": 0.3395322561264038, "rewards/grpo_reward_func/std": 0.11023792624473572, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.421875, "frac_reward_zero_std": 0.0, "grad_norm": 16.875, "kl": 8.121753853629343e-05, "learning_rate": 2.1666666666666667e-07, "loss": 0.0, "num_tokens": 682808.0, "reward": 0.26600906252861023, "reward_std": 0.08304192125797272, "rewards/grpo_reward_func/mean": 0.26600906252861023, "rewards/grpo_reward_func/std": 0.09309838712215424, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.4375, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "kl": 0.00022083613293943927, "learning_rate": 2.25e-07, "loss": 0.0, "num_tokens": 708184.0, "reward": 0.23112675547599792, "reward_std": 0.08039335906505585, "rewards/grpo_reward_func/mean": 0.23112675547599792, "rewards/grpo_reward_func/std": 0.20841825008392334, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.453125, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.0002448335289955139, "learning_rate": 2.3333333333333333e-07, "loss": 0.0, "num_tokens": 733168.0, "reward": 0.3006294071674347, "reward_std": 0.13648909330368042, "rewards/grpo_reward_func/mean": 0.3006294071674347, "rewards/grpo_reward_func/std": 0.1425219625234604, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.46875, "frac_reward_zero_std": 0.0, "grad_norm": 15.5625, "kl": 0.0002552429141360335, "learning_rate": 2.4166666666666665e-07, "loss": 0.0, "num_tokens": 758064.0, "reward": 0.26793670654296875, "reward_std": 0.12509800493717194, "rewards/grpo_reward_func/mean": 0.26793670654296875, "rewards/grpo_reward_func/std": 0.15831595659255981, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.484375, "frac_reward_zero_std": 0.0, "grad_norm": 16.5, "kl": 0.00015467405319213867, "learning_rate": 2.5e-07, "loss": 0.0, "num_tokens": 783520.0, "reward": 0.07176833599805832, "reward_std": 0.12234357744455338, "rewards/grpo_reward_func/mean": 0.07176833599805832, "rewards/grpo_reward_func/std": 0.12346605211496353, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "kl": 0.0002303921282873489, "learning_rate": 2.5833333333333333e-07, "loss": 0.0, "num_tokens": 809312.0, "reward": 0.04103652387857437, "reward_std": 0.08447092771530151, "rewards/grpo_reward_func/mean": 0.04103652387857437, "rewards/grpo_reward_func/std": 0.08558139950037003, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.515625, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "kl": 0.00012425271415850148, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "num_tokens": 834584.0, "reward": 0.24062111973762512, "reward_std": 0.11993544548749924, "rewards/grpo_reward_func/mean": 0.24062111973762512, "rewards/grpo_reward_func/std": 0.1465650498867035, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.53125, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "kl": 0.0003296689537819475, "learning_rate": 2.75e-07, "loss": 0.0, "num_tokens": 859728.0, "reward": 0.31566762924194336, "reward_std": 0.15053875744342804, "rewards/grpo_reward_func/mean": 0.31566762924194336, "rewards/grpo_reward_func/std": 0.17469221353530884, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.546875, "frac_reward_zero_std": 0.0, "grad_norm": 13.625, "kl": 0.00011381755030015483, "learning_rate": 2.833333333333333e-07, "loss": 0.0, "num_tokens": 884728.0, "reward": 0.34107768535614014, "reward_std": 0.14179188013076782, "rewards/grpo_reward_func/mean": 0.34107768535614014, "rewards/grpo_reward_func/std": 0.14815190434455872, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5625, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "kl": 0.00012111260366509669, "learning_rate": 2.916666666666667e-07, "loss": 0.0, "num_tokens": 909552.0, "reward": 0.454355925321579, "reward_std": 0.14741826057434082, "rewards/grpo_reward_func/mean": 0.454355925321579, "rewards/grpo_reward_func/std": 0.14277252554893494, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.578125, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.00044129292655270547, "learning_rate": 3e-07, "loss": 0.0, "num_tokens": 934856.0, "reward": 0.23417434096336365, "reward_std": 0.07929708808660507, "rewards/grpo_reward_func/mean": 0.23417434096336365, "rewards/grpo_reward_func/std": 0.11618053168058395, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.59375, "frac_reward_zero_std": 0.0, "grad_norm": 11.8125, "kl": 0.000225259609578643, "learning_rate": 3.0833333333333333e-07, "loss": 0.0, "num_tokens": 960368.0, "reward": 0.22501924633979797, "reward_std": 0.1290852576494217, "rewards/grpo_reward_func/mean": 0.22501924633979797, "rewards/grpo_reward_func/std": 0.23068629205226898, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.609375, "frac_reward_zero_std": 0.0, "grad_norm": 13.0, "kl": 0.00030248487746575847, "learning_rate": 3.166666666666666e-07, "loss": 0.0, "num_tokens": 985632.0, "reward": 0.3790042996406555, "reward_std": 0.16399240493774414, "rewards/grpo_reward_func/mean": 0.3790042996406555, "rewards/grpo_reward_func/std": 0.20435301959514618, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.625, "frac_reward_zero_std": 0.0, "grad_norm": 11.0, "kl": 0.0002346886321902275, "learning_rate": 3.25e-07, "loss": 0.0, "num_tokens": 1010840.0, "reward": 0.3319600224494934, "reward_std": 0.08543172478675842, "rewards/grpo_reward_func/mean": 0.3319600224494934, "rewards/grpo_reward_func/std": 0.11251801252365112, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.640625, "frac_reward_zero_std": 0.0, "grad_norm": 13.9375, "kl": 0.00011254071068833582, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 1035528.0, "reward": 0.5335917472839355, "reward_std": 0.09484530240297318, "rewards/grpo_reward_func/mean": 0.5335917472839355, "rewards/grpo_reward_func/std": 0.12994690239429474, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.65625, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "kl": 0.0003100304602412507, "learning_rate": 3.4166666666666664e-07, "loss": 0.0, "num_tokens": 1061056.0, "reward": 0.2819344103336334, "reward_std": 0.09613234549760818, "rewards/grpo_reward_func/mean": 0.2819344103336334, "rewards/grpo_reward_func/std": 0.17606547474861145, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.671875, "frac_reward_zero_std": 0.0, "grad_norm": 16.25, "kl": 0.00022800049191573635, "learning_rate": 3.5e-07, "loss": 0.0, "num_tokens": 1085800.0, "reward": 0.3839090168476105, "reward_std": 0.1359642744064331, "rewards/grpo_reward_func/mean": 0.3839090168476105, "rewards/grpo_reward_func/std": 0.14048616588115692, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6875, "frac_reward_zero_std": 0.0, "grad_norm": 7.8125, "kl": 0.0002025471330853179, "learning_rate": 3.583333333333333e-07, "loss": 0.0, "num_tokens": 1110848.0, "reward": 0.30948373675346375, "reward_std": 0.06383931636810303, "rewards/grpo_reward_func/mean": 0.30948373675346375, "rewards/grpo_reward_func/std": 0.07188048213720322, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.703125, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "kl": 0.00023324073845287785, "learning_rate": 3.666666666666666e-07, "loss": 0.0, "num_tokens": 1136088.0, "reward": 0.3444882035255432, "reward_std": 0.09875836968421936, "rewards/grpo_reward_func/mean": 0.3444882035255432, "rewards/grpo_reward_func/std": 0.1097148060798645, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.71875, "frac_reward_zero_std": 0.0, "grad_norm": 7.5, "kl": 0.00025422839826205745, "learning_rate": 3.75e-07, "loss": 0.0, "num_tokens": 1161032.0, "reward": 0.2477511316537857, "reward_std": 0.1270497590303421, "rewards/grpo_reward_func/mean": 0.2477511316537857, "rewards/grpo_reward_func/std": 0.15981581807136536, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.734375, "frac_reward_zero_std": 0.0, "grad_norm": 10.5, "kl": 0.00021969123918097466, "learning_rate": 3.8333333333333335e-07, "loss": 0.0, "num_tokens": 1185888.0, "reward": 0.46379101276397705, "reward_std": 0.14611366391181946, "rewards/grpo_reward_func/mean": 0.46379101276397705, "rewards/grpo_reward_func/std": 0.16491258144378662, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "kl": 0.0004819600435439497, "learning_rate": 3.9166666666666664e-07, "loss": 0.0, "num_tokens": 1211400.0, "reward": 0.23074069619178772, "reward_std": 0.14558511972427368, "rewards/grpo_reward_func/mean": 0.23074069619178772, "rewards/grpo_reward_func/std": 0.2206806093454361, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.765625, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "kl": 6.555269101227168e-05, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 1237400.0, "reward": 0.05634097009897232, "reward_std": 0.06590264290571213, "rewards/grpo_reward_func/mean": 0.05634097009897232, "rewards/grpo_reward_func/std": 0.10430527478456497, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.78125, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "kl": 0.0001880628988146782, "learning_rate": 4.083333333333333e-07, "loss": 0.0, "num_tokens": 1262712.0, "reward": 0.2721788287162781, "reward_std": 0.0947684645652771, "rewards/grpo_reward_func/mean": 0.2721788287162781, "rewards/grpo_reward_func/std": 0.12943580746650696, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.796875, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.0001086403317458462, "learning_rate": 4.1666666666666667e-07, "loss": 0.0, "num_tokens": 1287648.0, "reward": 0.4438517093658447, "reward_std": 0.14172330498695374, "rewards/grpo_reward_func/mean": 0.4438517093658447, "rewards/grpo_reward_func/std": 0.1430020034313202, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.8125, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "kl": 0.0003144231850455981, "learning_rate": 4.2499999999999995e-07, "loss": 0.0, "num_tokens": 1312272.0, "reward": 0.299557626247406, "reward_std": 0.11285382509231567, "rewards/grpo_reward_func/mean": 0.299557626247406, "rewards/grpo_reward_func/std": 0.11851979792118073, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.828125, "frac_reward_zero_std": 0.0, "grad_norm": 13.9375, "kl": 0.00012044112008879893, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "num_tokens": 1337296.0, "reward": 0.4098794460296631, "reward_std": 0.11588951200246811, "rewards/grpo_reward_func/mean": 0.4098794460296631, "rewards/grpo_reward_func/std": 0.11837032437324524, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.84375, "frac_reward_zero_std": 0.0, "grad_norm": 19.25, "kl": 0.00035638889676192775, "learning_rate": 4.4166666666666664e-07, "loss": 0.0, "num_tokens": 1362296.0, "reward": 0.34147408604621887, "reward_std": 0.08064563572406769, "rewards/grpo_reward_func/mean": 0.34147408604621887, "rewards/grpo_reward_func/std": 0.09037595242261887, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.859375, "frac_reward_zero_std": 0.0, "grad_norm": 13.875, "kl": 0.00010125773405889049, "learning_rate": 4.5e-07, "loss": 0.0, "num_tokens": 1387488.0, "reward": 0.34650981426239014, "reward_std": 0.10842345654964447, "rewards/grpo_reward_func/mean": 0.34650981426239014, "rewards/grpo_reward_func/std": 0.1347353607416153, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.875, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "kl": 0.00020377027249196544, "learning_rate": 4.5833333333333327e-07, "loss": 0.0, "num_tokens": 1412400.0, "reward": 0.4124220311641693, "reward_std": 0.0689394623041153, "rewards/grpo_reward_func/mean": 0.4124220311641693, "rewards/grpo_reward_func/std": 0.1449054330587387, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.890625, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "kl": 0.00019250033437856473, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "num_tokens": 1437976.0, "reward": 0.2810817062854767, "reward_std": 0.10197813808917999, "rewards/grpo_reward_func/mean": 0.2810817062854767, "rewards/grpo_reward_func/std": 0.2297528237104416, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.90625, "frac_reward_zero_std": 0.0, "grad_norm": 9.375, "kl": 0.00023638892162125558, "learning_rate": 4.7499999999999995e-07, "loss": 0.0, "num_tokens": 1463016.0, "reward": 0.4437309503555298, "reward_std": 0.15493561327457428, "rewards/grpo_reward_func/mean": 0.4437309503555298, "rewards/grpo_reward_func/std": 0.16053226590156555, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.921875, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.0003489665687084198, "learning_rate": 4.833333333333333e-07, "loss": 0.0, "num_tokens": 1487920.0, "reward": 0.27588674426078796, "reward_std": 0.09553509950637817, "rewards/grpo_reward_func/mean": 0.27588674426078796, "rewards/grpo_reward_func/std": 0.09245670586824417, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.9375, "frac_reward_zero_std": 0.0, "grad_norm": 14.8125, "kl": 0.00045755444443784654, "learning_rate": 4.916666666666666e-07, "loss": 0.0, "num_tokens": 1513336.0, "reward": 0.08732398599386215, "reward_std": 0.10541350394487381, "rewards/grpo_reward_func/mean": 0.08732398599386215, "rewards/grpo_reward_func/std": 0.10521270334720612, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.953125, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "kl": 0.000195571225049207, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 1538576.0, "reward": 0.2950664162635803, "reward_std": 0.11005343496799469, "rewards/grpo_reward_func/mean": 0.2950664162635803, "rewards/grpo_reward_func/std": 0.2244093418121338, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.96875, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.00013369570297072642, "learning_rate": 4.990740740740741e-07, "loss": 0.0, "num_tokens": 1563896.0, "reward": 0.17187045514583588, "reward_std": 0.06180661916732788, "rewards/grpo_reward_func/mean": 0.17187045514583588, "rewards/grpo_reward_func/std": 0.19724208116531372, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.984375, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "kl": 0.00016097879051812924, "learning_rate": 4.981481481481482e-07, "loss": 0.0, "num_tokens": 1589008.0, "reward": 0.2205008864402771, "reward_std": 0.053931236267089844, "rewards/grpo_reward_func/mean": 0.2205008864402771, "rewards/grpo_reward_func/std": 0.057305920869112015, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 13.75, "kl": 0.00014925623690942302, "learning_rate": 4.972222222222222e-07, "loss": 0.0, "num_tokens": 1613816.0, "reward": 0.4431067705154419, "reward_std": 0.14857327938079834, "rewards/grpo_reward_func/mean": 0.4431067705154419, "rewards/grpo_reward_func/std": 0.17278340458869934, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.015625, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "kl": 0.0001297382063967234, "learning_rate": 4.962962962962963e-07, "loss": 0.0, "num_tokens": 1639560.0, "reward": 0.2275439351797104, "reward_std": 0.1384904682636261, "rewards/grpo_reward_func/mean": 0.2275439351797104, "rewards/grpo_reward_func/std": 0.1672850400209427, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.03125, "frac_reward_zero_std": 0.0, "grad_norm": 12.3125, "kl": 6.904608108015964e-05, "learning_rate": 4.953703703703703e-07, "loss": 0.0, "num_tokens": 1665128.0, "reward": 0.047979630529880524, "reward_std": 0.11713965237140656, "rewards/grpo_reward_func/mean": 0.047979630529880524, "rewards/grpo_reward_func/std": 0.11486222594976425, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.046875, "frac_reward_zero_std": 0.0, "grad_norm": 13.25, "kl": 5.872082147106994e-05, "learning_rate": 4.944444444444445e-07, "loss": 0.0, "num_tokens": 1690152.0, "reward": 0.29636937379837036, "reward_std": 0.19861865043640137, "rewards/grpo_reward_func/mean": 0.29636937379837036, "rewards/grpo_reward_func/std": 0.20937326550483704, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.0625, "frac_reward_zero_std": 0.0, "grad_norm": 10.625, "kl": 0.0003152005447191186, "learning_rate": 4.935185185185185e-07, "loss": 0.0, "num_tokens": 1715608.0, "reward": 0.2516125440597534, "reward_std": 0.07490938156843185, "rewards/grpo_reward_func/mean": 0.2516125440597534, "rewards/grpo_reward_func/std": 0.14309823513031006, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.078125, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "kl": 0.0002456077709211968, "learning_rate": 4.925925925925926e-07, "loss": 0.0, "num_tokens": 1741720.0, "reward": 0.11342965066432953, "reward_std": 0.06376887857913971, "rewards/grpo_reward_func/mean": 0.11342965066432953, "rewards/grpo_reward_func/std": 0.11074693500995636, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.09375, "frac_reward_zero_std": 0.0, "grad_norm": 14.0, "kl": 4.5569922804133967e-05, "learning_rate": 4.916666666666666e-07, "loss": 0.0, "num_tokens": 1767576.0, "reward": 0.054858915507793427, "reward_std": 0.13666033744812012, "rewards/grpo_reward_func/mean": 0.054858915507793427, "rewards/grpo_reward_func/std": 0.169277161359787, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.109375, "frac_reward_zero_std": 0.0, "grad_norm": 13.625, "kl": 0.0002712399436859414, "learning_rate": 4.907407407407407e-07, "loss": 0.0, "num_tokens": 1792840.0, "reward": 0.13117259740829468, "reward_std": 0.0819777101278305, "rewards/grpo_reward_func/mean": 0.13117259740829468, "rewards/grpo_reward_func/std": 0.09603901952505112, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.125, "frac_reward_zero_std": 0.0, "grad_norm": 20.125, "kl": 0.00017950467008631676, "learning_rate": 4.898148148148148e-07, "loss": 0.0, "num_tokens": 1818224.0, "reward": 0.2819082736968994, "reward_std": 0.20238551497459412, "rewards/grpo_reward_func/mean": 0.2819082736968994, "rewards/grpo_reward_func/std": 0.2046259194612503, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.140625, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.00045007342123426497, "learning_rate": 4.888888888888889e-07, "loss": 0.0, "num_tokens": 1843328.0, "reward": 0.30494439601898193, "reward_std": 0.10446885973215103, "rewards/grpo_reward_func/mean": 0.30494439601898193, "rewards/grpo_reward_func/std": 0.13383115828037262, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.15625, "frac_reward_zero_std": 0.0, "grad_norm": 11.1875, "kl": 0.00025242754054488614, "learning_rate": 4.87962962962963e-07, "loss": 0.0, "num_tokens": 1868936.0, "reward": 0.21628513932228088, "reward_std": 0.09445344656705856, "rewards/grpo_reward_func/mean": 0.21628513932228088, "rewards/grpo_reward_func/std": 0.18989995121955872, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.171875, "frac_reward_zero_std": 0.0, "grad_norm": 16.25, "kl": 0.00047839961189311, "learning_rate": 4.87037037037037e-07, "loss": 0.0, "num_tokens": 1893848.0, "reward": 0.38288578391075134, "reward_std": 0.1278231143951416, "rewards/grpo_reward_func/mean": 0.38288578391075134, "rewards/grpo_reward_func/std": 0.1358482390642166, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.1875, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.00021722633391618729, "learning_rate": 4.861111111111111e-07, "loss": 0.0, "num_tokens": 1918984.0, "reward": 0.2882534861564636, "reward_std": 0.1554815173149109, "rewards/grpo_reward_func/mean": 0.2882534861564636, "rewards/grpo_reward_func/std": 0.214204341173172, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.203125, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.00019398704171180725, "learning_rate": 4.851851851851852e-07, "loss": 0.0, "num_tokens": 1944584.0, "reward": 0.21863818168640137, "reward_std": 0.11801601201295853, "rewards/grpo_reward_func/mean": 0.21863818168640137, "rewards/grpo_reward_func/std": 0.1758543998003006, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.21875, "frac_reward_zero_std": 0.0, "grad_norm": 15.375, "kl": 0.00019207410514354706, "learning_rate": 4.842592592592593e-07, "loss": 0.0, "num_tokens": 1969664.0, "reward": 0.41489866375923157, "reward_std": 0.1088038831949234, "rewards/grpo_reward_func/mean": 0.41489866375923157, "rewards/grpo_reward_func/std": 0.17758165299892426, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.234375, "frac_reward_zero_std": 0.0, "grad_norm": 9.9375, "kl": 0.00017749394464772195, "learning_rate": 4.833333333333333e-07, "loss": 0.0, "num_tokens": 1994640.0, "reward": 0.42801088094711304, "reward_std": 0.10921289026737213, "rewards/grpo_reward_func/mean": 0.42801088094711304, "rewards/grpo_reward_func/std": 0.14644165337085724, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.25, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.00038930773735046387, "learning_rate": 4.824074074074074e-07, "loss": 0.0, "num_tokens": 2020152.0, "reward": 0.16573889553546906, "reward_std": 0.08702096343040466, "rewards/grpo_reward_func/mean": 0.16573889553546906, "rewards/grpo_reward_func/std": 0.12194234132766724, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.265625, "frac_reward_zero_std": 0.0, "grad_norm": 7.96875, "kl": 0.00011919004100491293, "learning_rate": 4.814814814814814e-07, "loss": 0.0, "num_tokens": 2044944.0, "reward": 0.41396480798721313, "reward_std": 0.09650741517543793, "rewards/grpo_reward_func/mean": 0.41396480798721313, "rewards/grpo_reward_func/std": 0.11427146941423416, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.28125, "frac_reward_zero_std": 0.0, "grad_norm": 7.75, "kl": 0.0002481179908500053, "learning_rate": 4.805555555555555e-07, "loss": 0.0, "num_tokens": 2070520.0, "reward": 0.22157645225524902, "reward_std": 0.09660083055496216, "rewards/grpo_reward_func/mean": 0.22157645225524902, "rewards/grpo_reward_func/std": 0.20230649411678314, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.296875, "frac_reward_zero_std": 0.0, "grad_norm": 11.5625, "kl": 0.00045967100595589727, "learning_rate": 4.796296296296296e-07, "loss": 0.0, "num_tokens": 2095464.0, "reward": 0.5041211843490601, "reward_std": 0.1389543116092682, "rewards/grpo_reward_func/mean": 0.5041211843490601, "rewards/grpo_reward_func/std": 0.1713102161884308, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.3125, "frac_reward_zero_std": 0.0, "grad_norm": 6.03125, "kl": 0.0001403558962920215, "learning_rate": 4.787037037037037e-07, "loss": 0.0, "num_tokens": 2120688.0, "reward": 0.18113520741462708, "reward_std": 0.04671812802553177, "rewards/grpo_reward_func/mean": 0.18113520741462708, "rewards/grpo_reward_func/std": 0.12348335981369019, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.328125, "frac_reward_zero_std": 0.0, "grad_norm": 16.25, "kl": 0.00013873229181626812, "learning_rate": 4.777777777777778e-07, "loss": 0.0, "num_tokens": 2146368.0, "reward": 0.04078834131360054, "reward_std": 0.12815354764461517, "rewards/grpo_reward_func/mean": 0.04078834131360054, "rewards/grpo_reward_func/std": 0.12462079524993896, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.34375, "frac_reward_zero_std": 0.0, "grad_norm": 16.375, "kl": 0.0004987806605640799, "learning_rate": 4.768518518518518e-07, "loss": 0.0, "num_tokens": 2171880.0, "reward": 0.12024304270744324, "reward_std": 0.1480271965265274, "rewards/grpo_reward_func/mean": 0.12024304270744324, "rewards/grpo_reward_func/std": 0.1638474464416504, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.359375, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "kl": 0.0003001453878823668, "learning_rate": 4.759259259259259e-07, "loss": 0.0, "num_tokens": 2196888.0, "reward": 0.3662007451057434, "reward_std": 0.10381059348583221, "rewards/grpo_reward_func/mean": 0.3662007451057434, "rewards/grpo_reward_func/std": 0.10623158514499664, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.375, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "kl": 0.00020138671243330464, "learning_rate": 4.7499999999999995e-07, "loss": 0.0, "num_tokens": 2221912.0, "reward": 0.3808116912841797, "reward_std": 0.06605124473571777, "rewards/grpo_reward_func/mean": 0.3808116912841797, "rewards/grpo_reward_func/std": 0.0681883841753006, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.390625, "frac_reward_zero_std": 0.0, "grad_norm": 13.6875, "kl": 0.00010015349835157394, "learning_rate": 4.7407407407407405e-07, "loss": 0.0, "num_tokens": 2247072.0, "reward": 0.29432040452957153, "reward_std": 0.06491278856992722, "rewards/grpo_reward_func/mean": 0.29432040452957153, "rewards/grpo_reward_func/std": 0.11752089112997055, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.40625, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.000202179577172501, "learning_rate": 4.731481481481481e-07, "loss": 0.0, "num_tokens": 2272480.0, "reward": 0.2932785451412201, "reward_std": 0.08380497992038727, "rewards/grpo_reward_func/mean": 0.2932785451412201, "rewards/grpo_reward_func/std": 0.23896603286266327, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.421875, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.0004609463067026809, "learning_rate": 4.722222222222222e-07, "loss": 0.0, "num_tokens": 2297896.0, "reward": 0.21311567723751068, "reward_std": 0.09745917469263077, "rewards/grpo_reward_func/mean": 0.21311567723751068, "rewards/grpo_reward_func/std": 0.16419465839862823, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.4375, "frac_reward_zero_std": 0.0, "grad_norm": 12.5, "kl": 0.0002996331677422859, "learning_rate": 4.7129629629629626e-07, "loss": 0.0, "num_tokens": 2322992.0, "reward": 0.2457921952009201, "reward_std": 0.2100134938955307, "rewards/grpo_reward_func/mean": 0.2457921952009201, "rewards/grpo_reward_func/std": 0.2291945517063141, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.453125, "frac_reward_zero_std": 0.0, "grad_norm": 11.25, "kl": 0.00025584021204849705, "learning_rate": 4.7037037037037036e-07, "loss": 0.0, "num_tokens": 2348168.0, "reward": 0.26080936193466187, "reward_std": 0.10516969859600067, "rewards/grpo_reward_func/mean": 0.26080936193466187, "rewards/grpo_reward_func/std": 0.13873416185379028, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.46875, "frac_reward_zero_std": 0.0, "grad_norm": 13.125, "kl": 0.00024177134764613584, "learning_rate": 4.694444444444444e-07, "loss": 0.0, "num_tokens": 2373200.0, "reward": 0.20706593990325928, "reward_std": 0.09836722910404205, "rewards/grpo_reward_func/mean": 0.20706593990325928, "rewards/grpo_reward_func/std": 0.14156295359134674, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.484375, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.00038407588726840913, "learning_rate": 4.6851851851851846e-07, "loss": 0.0, "num_tokens": 2398136.0, "reward": 0.4286128580570221, "reward_std": 0.14354585111141205, "rewards/grpo_reward_func/mean": 0.4286128580570221, "rewards/grpo_reward_func/std": 0.1891767978668213, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.40625, "kl": 0.0002914089673140552, "learning_rate": 4.675925925925926e-07, "loss": 0.0, "num_tokens": 2423008.0, "reward": 0.32350456714630127, "reward_std": 0.07581804692745209, "rewards/grpo_reward_func/mean": 0.32350456714630127, "rewards/grpo_reward_func/std": 0.14216163754463196, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.515625, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.00029821879797964357, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "num_tokens": 2447880.0, "reward": 0.53780198097229, "reward_std": 0.13564899563789368, "rewards/grpo_reward_func/mean": 0.53780198097229, "rewards/grpo_reward_func/std": 0.13389934599399567, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.53125, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "kl": 0.00042561162263154984, "learning_rate": 4.657407407407407e-07, "loss": 0.0, "num_tokens": 2473776.0, "reward": 0.0427071787416935, "reward_std": 0.08712250739336014, "rewards/grpo_reward_func/mean": 0.0427071787416935, "rewards/grpo_reward_func/std": 0.08546540886163712, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.546875, "frac_reward_zero_std": 0.0, "grad_norm": 6.84375, "kl": 0.0001698797568678856, "learning_rate": 4.6481481481481476e-07, "loss": 0.0, "num_tokens": 2499040.0, "reward": 0.24955210089683533, "reward_std": 0.06630256026983261, "rewards/grpo_reward_func/mean": 0.24955210089683533, "rewards/grpo_reward_func/std": 0.09607692807912827, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.5625, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "kl": 0.0011094513902207837, "learning_rate": 4.6388888888888886e-07, "loss": 0.0, "num_tokens": 2524040.0, "reward": 0.37988805770874023, "reward_std": 0.06495396792888641, "rewards/grpo_reward_func/mean": 0.37988805770874023, "rewards/grpo_reward_func/std": 0.07306870073080063, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.578125, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "kl": 0.0002060849146801047, "learning_rate": 4.6296296296296297e-07, "loss": 0.0, "num_tokens": 2548920.0, "reward": 0.3687247037887573, "reward_std": 0.09085649251937866, "rewards/grpo_reward_func/mean": 0.3687247037887573, "rewards/grpo_reward_func/std": 0.09450332075357437, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.59375, "frac_reward_zero_std": 0.0, "grad_norm": 12.875, "kl": 0.00026761522894958034, "learning_rate": 4.62037037037037e-07, "loss": 0.0, "num_tokens": 2574456.0, "reward": 0.09724560379981995, "reward_std": 0.16156351566314697, "rewards/grpo_reward_func/mean": 0.09724560379981995, "rewards/grpo_reward_func/std": 0.16980990767478943, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.609375, "frac_reward_zero_std": 0.0, "grad_norm": 10.75, "kl": 0.0004442011268110946, "learning_rate": 4.611111111111111e-07, "loss": 0.0, "num_tokens": 2599584.0, "reward": 0.2229943573474884, "reward_std": 0.0884079784154892, "rewards/grpo_reward_func/mean": 0.2229943573474884, "rewards/grpo_reward_func/std": 0.09148803353309631, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.625, "frac_reward_zero_std": 0.0, "grad_norm": 13.75, "kl": 0.00045584855251945555, "learning_rate": 4.6018518518518517e-07, "loss": 0.0, "num_tokens": 2624336.0, "reward": 0.4652545750141144, "reward_std": 0.09365322440862656, "rewards/grpo_reward_func/mean": 0.4652545750141144, "rewards/grpo_reward_func/std": 0.09660826623439789, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.640625, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "kl": 0.000538857959327288, "learning_rate": 4.592592592592592e-07, "loss": 0.0, "num_tokens": 2650008.0, "reward": 0.22665664553642273, "reward_std": 0.08802333474159241, "rewards/grpo_reward_func/mean": 0.22665664553642273, "rewards/grpo_reward_func/std": 0.15664224326610565, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.65625, "frac_reward_zero_std": 0.0, "grad_norm": 12.3125, "kl": 0.0010045859962701797, "learning_rate": 4.5833333333333327e-07, "loss": 0.0, "num_tokens": 2675152.0, "reward": 0.27369487285614014, "reward_std": 0.23123349249362946, "rewards/grpo_reward_func/mean": 0.27369487285614014, "rewards/grpo_reward_func/std": 0.24010290205478668, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.671875, "frac_reward_zero_std": 0.0, "grad_norm": 15.875, "kl": 0.00037650827107427176, "learning_rate": 4.574074074074074e-07, "loss": 0.0, "num_tokens": 2700032.0, "reward": 0.3958902955055237, "reward_std": 0.17344766855239868, "rewards/grpo_reward_func/mean": 0.3958902955055237, "rewards/grpo_reward_func/std": 0.17110610008239746, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.6875, "frac_reward_zero_std": 0.0, "grad_norm": 15.375, "kl": 0.0003967365773860365, "learning_rate": 4.564814814814815e-07, "loss": 0.0, "num_tokens": 2725416.0, "reward": 0.31058400869369507, "reward_std": 0.1508956253528595, "rewards/grpo_reward_func/mean": 0.31058400869369507, "rewards/grpo_reward_func/std": 0.19560779631137848, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.703125, "frac_reward_zero_std": 0.0, "grad_norm": 7.90625, "kl": 0.0006652238371316344, "learning_rate": 4.555555555555555e-07, "loss": 0.0, "num_tokens": 2750208.0, "reward": 0.4809446930885315, "reward_std": 0.14983296394348145, "rewards/grpo_reward_func/mean": 0.4809446930885315, "rewards/grpo_reward_func/std": 0.14483514428138733, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.71875, "frac_reward_zero_std": 0.0, "grad_norm": 9.4375, "kl": 0.00035489382571540773, "learning_rate": 4.5462962962962957e-07, "loss": 0.0, "num_tokens": 2775504.0, "reward": 0.22507444024085999, "reward_std": 0.08060501515865326, "rewards/grpo_reward_func/mean": 0.22507444024085999, "rewards/grpo_reward_func/std": 0.16900670528411865, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.734375, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "kl": 0.0005258495511952788, "learning_rate": 4.537037037037037e-07, "loss": 0.0, "num_tokens": 2800328.0, "reward": 0.3424970507621765, "reward_std": 0.13174626231193542, "rewards/grpo_reward_func/mean": 0.3424970507621765, "rewards/grpo_reward_func/std": 0.1453038901090622, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.75, "frac_reward_zero_std": 0.0, "grad_norm": 14.8125, "kl": 0.0002995077520608902, "learning_rate": 4.527777777777778e-07, "loss": 0.0, "num_tokens": 2825752.0, "reward": 0.23321793973445892, "reward_std": 0.09155033528804779, "rewards/grpo_reward_func/mean": 0.23321793973445892, "rewards/grpo_reward_func/std": 0.09220299124717712, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.765625, "frac_reward_zero_std": 0.0, "grad_norm": 7.09375, "kl": 0.0007548263820353895, "learning_rate": 4.5185185185185183e-07, "loss": 0.0, "num_tokens": 2850680.0, "reward": 0.3993395268917084, "reward_std": 0.12416817247867584, "rewards/grpo_reward_func/mean": 0.3993395268917084, "rewards/grpo_reward_func/std": 0.1299920380115509, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.78125, "frac_reward_zero_std": 0.0, "grad_norm": 10.0, "kl": 0.00034774900996126235, "learning_rate": 4.5092592592592593e-07, "loss": 0.0, "num_tokens": 2876104.0, "reward": 0.3269900381565094, "reward_std": 0.0974041149020195, "rewards/grpo_reward_func/mean": 0.3269900381565094, "rewards/grpo_reward_func/std": 0.24951301515102386, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.796875, "frac_reward_zero_std": 0.0, "grad_norm": 13.875, "kl": 0.0004522266535786912, "learning_rate": 4.5e-07, "loss": 0.0, "num_tokens": 2901024.0, "reward": 0.42044663429260254, "reward_std": 0.1444629728794098, "rewards/grpo_reward_func/mean": 0.42044663429260254, "rewards/grpo_reward_func/std": 0.14634843170642853, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.8125, "frac_reward_zero_std": 0.0, "grad_norm": 13.8125, "kl": 0.0006757595692761242, "learning_rate": 4.4907407407407403e-07, "loss": 0.0, "num_tokens": 2926040.0, "reward": 0.34109240770339966, "reward_std": 0.133412703871727, "rewards/grpo_reward_func/mean": 0.34109240770339966, "rewards/grpo_reward_func/std": 0.12967567145824432, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.828125, "frac_reward_zero_std": 0.0, "grad_norm": 14.8125, "kl": 0.0002280392945976928, "learning_rate": 4.4814814814814813e-07, "loss": 0.0, "num_tokens": 2951408.0, "reward": 0.292447566986084, "reward_std": 0.14831313490867615, "rewards/grpo_reward_func/mean": 0.292447566986084, "rewards/grpo_reward_func/std": 0.2516910433769226, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.84375, "frac_reward_zero_std": 0.0, "grad_norm": 6.875, "kl": 0.0001139113082899712, "learning_rate": 4.4722222222222223e-07, "loss": 0.0, "num_tokens": 2976456.0, "reward": 0.3284240961074829, "reward_std": 0.05646292120218277, "rewards/grpo_reward_func/mean": 0.3284240961074829, "rewards/grpo_reward_func/std": 0.08714665472507477, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.859375, "frac_reward_zero_std": 0.0, "grad_norm": 13.5625, "kl": 0.00031560100615024567, "learning_rate": 4.462962962962963e-07, "loss": 0.0, "num_tokens": 3001344.0, "reward": 0.3213407099246979, "reward_std": 0.07400795072317123, "rewards/grpo_reward_func/mean": 0.3213407099246979, "rewards/grpo_reward_func/std": 0.07413279265165329, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.875, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.00014330726116895676, "learning_rate": 4.4537037037037033e-07, "loss": 0.0, "num_tokens": 3026520.0, "reward": 0.2996346056461334, "reward_std": 0.10851671546697617, "rewards/grpo_reward_func/mean": 0.2996346056461334, "rewards/grpo_reward_func/std": 0.14822526276111603, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.890625, "frac_reward_zero_std": 0.0, "grad_norm": 11.1875, "kl": 0.0003187606780556962, "learning_rate": 4.444444444444444e-07, "loss": 0.0, "num_tokens": 3051592.0, "reward": 0.3903766870498657, "reward_std": 0.09058903157711029, "rewards/grpo_reward_func/mean": 0.3903766870498657, "rewards/grpo_reward_func/std": 0.0985388308763504, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.90625, "frac_reward_zero_std": 0.0, "grad_norm": 10.5625, "kl": 0.0011158532870467752, "learning_rate": 4.4351851851851854e-07, "loss": 0.0, "num_tokens": 3076160.0, "reward": 0.4783337712287903, "reward_std": 0.09513237327337265, "rewards/grpo_reward_func/mean": 0.4783337712287903, "rewards/grpo_reward_func/std": 0.15691599249839783, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.921875, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.0004599780368153006, "learning_rate": 4.425925925925926e-07, "loss": 0.0, "num_tokens": 3101632.0, "reward": 0.21846909821033478, "reward_std": 0.1349727213382721, "rewards/grpo_reward_func/mean": 0.21846909821033478, "rewards/grpo_reward_func/std": 0.1770293265581131, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.9375, "frac_reward_zero_std": 0.0, "grad_norm": 14.0625, "kl": 0.00045861614489695057, "learning_rate": 4.4166666666666664e-07, "loss": 0.0, "num_tokens": 3126688.0, "reward": 0.35453712940216064, "reward_std": 0.07507544755935669, "rewards/grpo_reward_func/mean": 0.35453712940216064, "rewards/grpo_reward_func/std": 0.08851905167102814, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.953125, "frac_reward_zero_std": 0.0, "grad_norm": 12.375, "kl": 0.000633828341960907, "learning_rate": 4.4074074074074074e-07, "loss": 0.0, "num_tokens": 3151600.0, "reward": 0.42087167501449585, "reward_std": 0.10288920998573303, "rewards/grpo_reward_func/mean": 0.42087167501449585, "rewards/grpo_reward_func/std": 0.2240506261587143, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.96875, "frac_reward_zero_std": 0.0, "grad_norm": 5.125, "kl": 0.0003298576921224594, "learning_rate": 4.398148148148148e-07, "loss": 0.0, "num_tokens": 3177000.0, "reward": 0.20600494742393494, "reward_std": 0.07367828488349915, "rewards/grpo_reward_func/mean": 0.20600494742393494, "rewards/grpo_reward_func/std": 0.15181554853916168, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.984375, "frac_reward_zero_std": 0.0, "grad_norm": 13.8125, "kl": 0.0007094539323588833, "learning_rate": 4.3888888888888884e-07, "loss": 0.0, "num_tokens": 3202096.0, "reward": 0.3842203915119171, "reward_std": 0.09158715605735779, "rewards/grpo_reward_func/mean": 0.3842203915119171, "rewards/grpo_reward_func/std": 0.18436087667942047, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.0, "frac_reward_zero_std": 0.0, "grad_norm": 7.0625, "kl": 0.0004973138275090605, "learning_rate": 4.3796296296296294e-07, "loss": 0.0, "num_tokens": 3227632.0, "reward": 0.14040973782539368, "reward_std": 0.051458939909935, "rewards/grpo_reward_func/mean": 0.14040973782539368, "rewards/grpo_reward_func/std": 0.05102093145251274, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.015625, "frac_reward_zero_std": 0.0, "grad_norm": 15.875, "kl": 0.0005900245305383578, "learning_rate": 4.3703703703703704e-07, "loss": 0.0, "num_tokens": 3252728.0, "reward": 0.23317044973373413, "reward_std": 0.12486948072910309, "rewards/grpo_reward_func/mean": 0.23317044973373413, "rewards/grpo_reward_func/std": 0.18154384195804596, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.03125, "frac_reward_zero_std": 0.0, "grad_norm": 10.4375, "kl": 0.0003436406550463289, "learning_rate": 4.361111111111111e-07, "loss": 0.0, "num_tokens": 3277808.0, "reward": 0.2726486921310425, "reward_std": 0.10262490808963776, "rewards/grpo_reward_func/mean": 0.2726486921310425, "rewards/grpo_reward_func/std": 0.1630534678697586, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.046875, "frac_reward_zero_std": 0.0, "grad_norm": 8.1875, "kl": 0.0006743809208273888, "learning_rate": 4.3518518518518514e-07, "loss": 0.0, "num_tokens": 3302816.0, "reward": 0.36714380979537964, "reward_std": 0.05760783702135086, "rewards/grpo_reward_func/mean": 0.36714380979537964, "rewards/grpo_reward_func/std": 0.06256558746099472, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.0625, "frac_reward_zero_std": 0.0, "grad_norm": 12.125, "kl": 0.00044845137745141983, "learning_rate": 4.342592592592592e-07, "loss": 0.0, "num_tokens": 3327936.0, "reward": 0.33067017793655396, "reward_std": 0.08341982960700989, "rewards/grpo_reward_func/mean": 0.33067017793655396, "rewards/grpo_reward_func/std": 0.09450868517160416, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.078125, "frac_reward_zero_std": 0.0, "grad_norm": 9.5625, "kl": 0.0001626002267585136, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "num_tokens": 3353384.0, "reward": 0.1081024780869484, "reward_std": 0.04969579726457596, "rewards/grpo_reward_func/mean": 0.1081024780869484, "rewards/grpo_reward_func/std": 0.07089214026927948, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.09375, "frac_reward_zero_std": 0.0, "grad_norm": 4.9375, "kl": 0.0006542075570905581, "learning_rate": 4.324074074074074e-07, "loss": 0.0, "num_tokens": 3378104.0, "reward": 0.5095803737640381, "reward_std": 0.07591477781534195, "rewards/grpo_reward_func/mean": 0.5095803737640381, "rewards/grpo_reward_func/std": 0.15140148997306824, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.109375, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.00048515634262003005, "learning_rate": 4.3148148148148145e-07, "loss": 0.0, "num_tokens": 3403664.0, "reward": 0.28993886709213257, "reward_std": 0.11650910973548889, "rewards/grpo_reward_func/mean": 0.28993886709213257, "rewards/grpo_reward_func/std": 0.2471858263015747, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.125, "frac_reward_zero_std": 0.0, "grad_norm": 11.9375, "kl": 0.000738232396543026, "learning_rate": 4.3055555555555555e-07, "loss": 0.0, "num_tokens": 3428648.0, "reward": 0.29969072341918945, "reward_std": 0.077365942299366, "rewards/grpo_reward_func/mean": 0.29969072341918945, "rewards/grpo_reward_func/std": 0.07797099649906158, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.140625, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "kl": 0.0003166009337292053, "learning_rate": 4.296296296296296e-07, "loss": 0.0, "num_tokens": 3453600.0, "reward": 0.5390628576278687, "reward_std": 0.09630399942398071, "rewards/grpo_reward_func/mean": 0.5390628576278687, "rewards/grpo_reward_func/std": 0.10668856650590897, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.15625, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "kl": 0.00017619847494643182, "learning_rate": 4.287037037037037e-07, "loss": 0.0, "num_tokens": 3479008.0, "reward": 0.08932866156101227, "reward_std": 0.11037556082010269, "rewards/grpo_reward_func/mean": 0.08932866156101227, "rewards/grpo_reward_func/std": 0.13285614550113678, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.171875, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "kl": 0.0005297964962664992, "learning_rate": 4.2777777777777775e-07, "loss": 0.0, "num_tokens": 3503672.0, "reward": 0.410762220621109, "reward_std": 0.11265414208173752, "rewards/grpo_reward_func/mean": 0.410762220621109, "rewards/grpo_reward_func/std": 0.11756953597068787, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.1875, "frac_reward_zero_std": 0.0, "grad_norm": 11.8125, "kl": 0.0002442055119900033, "learning_rate": 4.2685185185185186e-07, "loss": 0.0, "num_tokens": 3528696.0, "reward": 0.35452505946159363, "reward_std": 0.08229420334100723, "rewards/grpo_reward_func/mean": 0.35452505946159363, "rewards/grpo_reward_func/std": 0.10353206098079681, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.203125, "frac_reward_zero_std": 0.0, "grad_norm": 16.5, "kl": 0.00030247091854107566, "learning_rate": 4.259259259259259e-07, "loss": 0.0, "num_tokens": 3554200.0, "reward": 0.10482534766197205, "reward_std": 0.16839897632598877, "rewards/grpo_reward_func/mean": 0.10482534766197205, "rewards/grpo_reward_func/std": 0.1643439382314682, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.21875, "frac_reward_zero_std": 0.0, "grad_norm": 11.9375, "kl": 0.0009008496999740601, "learning_rate": 4.2499999999999995e-07, "loss": 0.0, "num_tokens": 3579256.0, "reward": 0.344069242477417, "reward_std": 0.12131404876708984, "rewards/grpo_reward_func/mean": 0.344069242477417, "rewards/grpo_reward_func/std": 0.18182261288166046, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.234375, "frac_reward_zero_std": 0.0, "grad_norm": 8.1875, "kl": 0.0005545982421608642, "learning_rate": 4.24074074074074e-07, "loss": 0.0, "num_tokens": 3605048.0, "reward": 0.1483575701713562, "reward_std": 0.05153876543045044, "rewards/grpo_reward_func/mean": 0.1483575701713562, "rewards/grpo_reward_func/std": 0.09833282232284546, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.25, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "kl": 0.0008367840491700917, "learning_rate": 4.2314814814814816e-07, "loss": 0.0, "num_tokens": 3630656.0, "reward": 0.2435261607170105, "reward_std": 0.10582385957241058, "rewards/grpo_reward_func/mean": 0.2435261607170105, "rewards/grpo_reward_func/std": 0.2306629717350006, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.265625, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.0006366781890392303, "learning_rate": 4.222222222222222e-07, "loss": 0.0, "num_tokens": 3655736.0, "reward": 0.25244393944740295, "reward_std": 0.13968312740325928, "rewards/grpo_reward_func/mean": 0.25244393944740295, "rewards/grpo_reward_func/std": 0.20445255935192108, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.28125, "frac_reward_zero_std": 0.0, "grad_norm": 8.9375, "kl": 0.0004489117636694573, "learning_rate": 4.2129629629629626e-07, "loss": 0.0, "num_tokens": 3681032.0, "reward": 0.2341691255569458, "reward_std": 0.08564946055412292, "rewards/grpo_reward_func/mean": 0.2341691255569458, "rewards/grpo_reward_func/std": 0.14472095668315887, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.296875, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.0003370772956259316, "learning_rate": 4.2037037037037036e-07, "loss": 0.0, "num_tokens": 3705952.0, "reward": 0.30148059129714966, "reward_std": 0.07144688069820404, "rewards/grpo_reward_func/mean": 0.30148059129714966, "rewards/grpo_reward_func/std": 0.18274889886379242, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.3125, "frac_reward_zero_std": 0.0, "grad_norm": 14.25, "kl": 0.0002702403216972016, "learning_rate": 4.194444444444444e-07, "loss": 0.0, "num_tokens": 3731040.0, "reward": 0.36687034368515015, "reward_std": 0.13937704265117645, "rewards/grpo_reward_func/mean": 0.36687034368515015, "rewards/grpo_reward_func/std": 0.14671309292316437, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.328125, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "kl": 0.0001179131959361257, "learning_rate": 4.185185185185185e-07, "loss": 0.0, "num_tokens": 3755968.0, "reward": 0.35363078117370605, "reward_std": 0.0943349301815033, "rewards/grpo_reward_func/mean": 0.35363078117370605, "rewards/grpo_reward_func/std": 0.10976386070251465, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.34375, "frac_reward_zero_std": 0.0, "grad_norm": 6.5, "kl": 0.0003840612989733927, "learning_rate": 4.1759259259259256e-07, "loss": 0.0, "num_tokens": 3781184.0, "reward": 0.2982335686683655, "reward_std": 0.09461906552314758, "rewards/grpo_reward_func/mean": 0.2982335686683655, "rewards/grpo_reward_func/std": 0.15051327645778656, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.359375, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "kl": 0.0006028078350936994, "learning_rate": 4.1666666666666667e-07, "loss": 0.0, "num_tokens": 3806432.0, "reward": 0.22157108783721924, "reward_std": 0.07859226316213608, "rewards/grpo_reward_func/mean": 0.22157108783721924, "rewards/grpo_reward_func/std": 0.17600607872009277, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.375, "frac_reward_zero_std": 0.0, "grad_norm": 8.4375, "kl": 0.00030071971559664235, "learning_rate": 4.157407407407407e-07, "loss": 0.0, "num_tokens": 3831640.0, "reward": 0.2901885509490967, "reward_std": 0.11274297535419464, "rewards/grpo_reward_func/mean": 0.2901885509490967, "rewards/grpo_reward_func/std": 0.21348397433757782, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.390625, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "kl": 0.0004837081505684182, "learning_rate": 4.1481481481481476e-07, "loss": 0.0, "num_tokens": 3857408.0, "reward": 0.09370775520801544, "reward_std": 0.10032463073730469, "rewards/grpo_reward_func/mean": 0.09370775520801544, "rewards/grpo_reward_func/std": 0.11353815346956253, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.40625, "frac_reward_zero_std": 0.0, "grad_norm": 13.1875, "kl": 0.0006801660056225955, "learning_rate": 4.1388888888888887e-07, "loss": 0.0, "num_tokens": 3882816.0, "reward": 0.2329578548669815, "reward_std": 0.12824667990207672, "rewards/grpo_reward_func/mean": 0.2329578548669815, "rewards/grpo_reward_func/std": 0.24429479241371155, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.421875, "frac_reward_zero_std": 0.0, "grad_norm": 13.125, "kl": 0.00012743038450935273, "learning_rate": 4.1296296296296297e-07, "loss": 0.0, "num_tokens": 3908224.0, "reward": 0.22462239861488342, "reward_std": 0.06313692033290863, "rewards/grpo_reward_func/mean": 0.22462239861488342, "rewards/grpo_reward_func/std": 0.15244141221046448, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.4375, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.0007045886595733464, "learning_rate": 4.12037037037037e-07, "loss": 0.0, "num_tokens": 3933728.0, "reward": 0.07521216571331024, "reward_std": 0.07558181881904602, "rewards/grpo_reward_func/mean": 0.07521216571331024, "rewards/grpo_reward_func/std": 0.09718126058578491, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.453125, "frac_reward_zero_std": 0.0, "grad_norm": 12.375, "kl": 0.00023933003103593364, "learning_rate": 4.1111111111111107e-07, "loss": 0.0, "num_tokens": 3958840.0, "reward": 0.2714136838912964, "reward_std": 0.13314189016819, "rewards/grpo_reward_func/mean": 0.2714136838912964, "rewards/grpo_reward_func/std": 0.1861964911222458, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.46875, "frac_reward_zero_std": 0.0, "grad_norm": 7.75, "kl": 0.00025627949071349576, "learning_rate": 4.1018518518518517e-07, "loss": 0.0, "num_tokens": 3983984.0, "reward": 0.30816277861595154, "reward_std": 0.07617802917957306, "rewards/grpo_reward_func/mean": 0.30816277861595154, "rewards/grpo_reward_func/std": 0.09437035024166107, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.484375, "frac_reward_zero_std": 0.0, "grad_norm": 9.9375, "kl": 0.0006072036921977997, "learning_rate": 4.092592592592593e-07, "loss": 0.0, "num_tokens": 4009128.0, "reward": 0.3487386405467987, "reward_std": 0.10052464157342911, "rewards/grpo_reward_func/mean": 0.3487386405467987, "rewards/grpo_reward_func/std": 0.09949901700019836, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.5, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.0005894883797736838, "learning_rate": 4.083333333333333e-07, "loss": 0.0, "num_tokens": 4034272.0, "reward": 0.20100285112857819, "reward_std": 0.07660327851772308, "rewards/grpo_reward_func/mean": 0.20100285112857819, "rewards/grpo_reward_func/std": 0.10585327446460724, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.515625, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "kl": 0.0003513234405545518, "learning_rate": 4.0740740740740737e-07, "loss": 0.0, "num_tokens": 4060464.0, "reward": 0.17975829541683197, "reward_std": 0.12321469932794571, "rewards/grpo_reward_func/mean": 0.17975829541683197, "rewards/grpo_reward_func/std": 0.19033879041671753, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.53125, "frac_reward_zero_std": 0.0, "grad_norm": 15.5625, "kl": 0.0007510657014790922, "learning_rate": 4.064814814814815e-07, "loss": 0.0, "num_tokens": 4085416.0, "reward": 0.4079420566558838, "reward_std": 0.1232195794582367, "rewards/grpo_reward_func/mean": 0.4079420566558838, "rewards/grpo_reward_func/std": 0.12682023644447327, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.546875, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "kl": 0.0005510623304871842, "learning_rate": 4.055555555555555e-07, "loss": 0.0, "num_tokens": 4110488.0, "reward": 0.25836923718452454, "reward_std": 0.08329159766435623, "rewards/grpo_reward_func/mean": 0.25836923718452454, "rewards/grpo_reward_func/std": 0.08466833829879761, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.5625, "frac_reward_zero_std": 0.0, "grad_norm": 17.125, "kl": 0.0007114599866326898, "learning_rate": 4.046296296296296e-07, "loss": 0.0, "num_tokens": 4136408.0, "reward": 0.12126512080430984, "reward_std": 0.08648187667131424, "rewards/grpo_reward_func/mean": 0.12126512080430984, "rewards/grpo_reward_func/std": 0.10869091749191284, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.578125, "frac_reward_zero_std": 0.0, "grad_norm": 11.5625, "kl": 0.00045018985110800713, "learning_rate": 4.0370370370370373e-07, "loss": 0.0, "num_tokens": 4161264.0, "reward": 0.4755350947380066, "reward_std": 0.11967408657073975, "rewards/grpo_reward_func/mean": 0.4755350947380066, "rewards/grpo_reward_func/std": 0.13084648549556732, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.59375, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "kl": 0.0006938679289305583, "learning_rate": 4.027777777777778e-07, "loss": 0.0, "num_tokens": 4186760.0, "reward": 0.27864474058151245, "reward_std": 0.13842284679412842, "rewards/grpo_reward_func/mean": 0.27864474058151245, "rewards/grpo_reward_func/std": 0.26182281970977783, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.609375, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.00033565983176231384, "learning_rate": 4.0185185185185183e-07, "loss": 0.0, "num_tokens": 4211760.0, "reward": 0.32802748680114746, "reward_std": 0.0578024685382843, "rewards/grpo_reward_func/mean": 0.32802748680114746, "rewards/grpo_reward_func/std": 0.05809301882982254, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.625, "frac_reward_zero_std": 0.0, "grad_norm": 10.875, "kl": 0.0002027436494245194, "learning_rate": 4.009259259259259e-07, "loss": 0.0, "num_tokens": 4236952.0, "reward": 0.27089524269104004, "reward_std": 0.09902771562337875, "rewards/grpo_reward_func/mean": 0.27089524269104004, "rewards/grpo_reward_func/std": 0.1164289340376854, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.640625, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "kl": 0.00028807235503336415, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 4262032.0, "reward": 0.2959464192390442, "reward_std": 0.08586762100458145, "rewards/grpo_reward_func/mean": 0.2959464192390442, "rewards/grpo_reward_func/std": 0.09414460510015488, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.65625, "frac_reward_zero_std": 0.0, "grad_norm": 9.9375, "kl": 0.0005565596075030044, "learning_rate": 3.990740740740741e-07, "loss": 0.0, "num_tokens": 4286880.0, "reward": 0.42600101232528687, "reward_std": 0.13071218132972717, "rewards/grpo_reward_func/mean": 0.42600101232528687, "rewards/grpo_reward_func/std": 0.1445237398147583, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.671875, "frac_reward_zero_std": 0.0, "grad_norm": 13.1875, "kl": 0.0007460726046701893, "learning_rate": 3.9814814814814813e-07, "loss": 0.0, "num_tokens": 4312808.0, "reward": 0.10033319890499115, "reward_std": 0.06247374042868614, "rewards/grpo_reward_func/mean": 0.10033319890499115, "rewards/grpo_reward_func/std": 0.06105644628405571, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.6875, "frac_reward_zero_std": 0.0, "grad_norm": 6.84375, "kl": 0.0002445072532282211, "learning_rate": 3.972222222222222e-07, "loss": 0.0, "num_tokens": 4337864.0, "reward": 0.2938867211341858, "reward_std": 0.11876720935106277, "rewards/grpo_reward_func/mean": 0.2938867211341858, "rewards/grpo_reward_func/std": 0.14251013100147247, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.703125, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "kl": 0.0003134909420623444, "learning_rate": 3.962962962962963e-07, "loss": 0.0, "num_tokens": 4363144.0, "reward": 0.2597373127937317, "reward_std": 0.1320008635520935, "rewards/grpo_reward_func/mean": 0.2597373127937317, "rewards/grpo_reward_func/std": 0.14323653280735016, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.71875, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "kl": 0.0004200125113129616, "learning_rate": 3.9537037037037034e-07, "loss": 0.0, "num_tokens": 4387896.0, "reward": 0.3413686752319336, "reward_std": 0.11463446915149689, "rewards/grpo_reward_func/mean": 0.3413686752319336, "rewards/grpo_reward_func/std": 0.14729353785514832, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.734375, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "kl": 0.0007713943195994943, "learning_rate": 3.9444444444444444e-07, "loss": 0.0, "num_tokens": 4412888.0, "reward": 0.25845998525619507, "reward_std": 0.07939323782920837, "rewards/grpo_reward_func/mean": 0.25845998525619507, "rewards/grpo_reward_func/std": 0.21095220744609833, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.75, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "kl": 0.0008315810118801892, "learning_rate": 3.9351851851851854e-07, "loss": 0.0, "num_tokens": 4438472.0, "reward": 0.2494005262851715, "reward_std": 0.12670020759105682, "rewards/grpo_reward_func/mean": 0.2494005262851715, "rewards/grpo_reward_func/std": 0.24672208726406097, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.765625, "frac_reward_zero_std": 0.0, "grad_norm": 14.25, "kl": 0.0003384503797860816, "learning_rate": 3.925925925925926e-07, "loss": 0.0, "num_tokens": 4463536.0, "reward": 0.3772205710411072, "reward_std": 0.13446420431137085, "rewards/grpo_reward_func/mean": 0.3772205710411072, "rewards/grpo_reward_func/std": 0.13952113687992096, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.78125, "frac_reward_zero_std": 0.0, "grad_norm": 9.375, "kl": 0.0008698782767169178, "learning_rate": 3.9166666666666664e-07, "loss": 0.0, "num_tokens": 4489168.0, "reward": 0.25278976559638977, "reward_std": 0.13060712814331055, "rewards/grpo_reward_func/mean": 0.25278976559638977, "rewards/grpo_reward_func/std": 0.24315965175628662, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.796875, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "kl": 0.0002482995987520553, "learning_rate": 3.907407407407407e-07, "loss": 0.0, "num_tokens": 4514576.0, "reward": 0.26432496309280396, "reward_std": 0.10723777115345001, "rewards/grpo_reward_func/mean": 0.26432496309280396, "rewards/grpo_reward_func/std": 0.16780295968055725, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.8125, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.0007870141416788101, "learning_rate": 3.898148148148148e-07, "loss": 0.0, "num_tokens": 4539328.0, "reward": 0.40482616424560547, "reward_std": 0.16082629561424255, "rewards/grpo_reward_func/mean": 0.40482616424560547, "rewards/grpo_reward_func/std": 0.15713298320770264, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.828125, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "kl": 0.0006626859976677224, "learning_rate": 3.888888888888889e-07, "loss": 0.0, "num_tokens": 4564408.0, "reward": 0.35017871856689453, "reward_std": 0.11673957854509354, "rewards/grpo_reward_func/mean": 0.35017871856689453, "rewards/grpo_reward_func/std": 0.12969790399074554, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.84375, "frac_reward_zero_std": 0.0, "grad_norm": 19.625, "kl": 0.000518678076332435, "learning_rate": 3.8796296296296294e-07, "loss": 0.0, "num_tokens": 4589304.0, "reward": 0.29897165298461914, "reward_std": 0.1672011762857437, "rewards/grpo_reward_func/mean": 0.29897165298461914, "rewards/grpo_reward_func/std": 0.17964954674243927, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.859375, "frac_reward_zero_std": 0.0, "grad_norm": 13.1875, "kl": 0.0003269196895416826, "learning_rate": 3.87037037037037e-07, "loss": 0.0, "num_tokens": 4614184.0, "reward": 0.3464832901954651, "reward_std": 0.09929412603378296, "rewards/grpo_reward_func/mean": 0.3464832901954651, "rewards/grpo_reward_func/std": 0.1351390779018402, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.875, "frac_reward_zero_std": 0.0, "grad_norm": 13.6875, "kl": 0.0004685633030021563, "learning_rate": 3.861111111111111e-07, "loss": 0.0, "num_tokens": 4640008.0, "reward": 0.1745709925889969, "reward_std": 0.17333576083183289, "rewards/grpo_reward_func/mean": 0.1745709925889969, "rewards/grpo_reward_func/std": 0.1874336302280426, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.890625, "frac_reward_zero_std": 0.0, "grad_norm": 11.125, "kl": 0.00043425335024949163, "learning_rate": 3.8518518518518515e-07, "loss": 0.0, "num_tokens": 4665312.0, "reward": 0.28968238830566406, "reward_std": 0.16824908554553986, "rewards/grpo_reward_func/mean": 0.28968238830566406, "rewards/grpo_reward_func/std": 0.2248057723045349, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.90625, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "kl": 0.0006378417165251449, "learning_rate": 3.8425925925925925e-07, "loss": 0.0, "num_tokens": 4690392.0, "reward": 0.397042453289032, "reward_std": 0.17139402031898499, "rewards/grpo_reward_func/mean": 0.397042453289032, "rewards/grpo_reward_func/std": 0.2513841390609741, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.921875, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.0007361011957982555, "learning_rate": 3.8333333333333335e-07, "loss": 0.0, "num_tokens": 4714992.0, "reward": 0.29339537024497986, "reward_std": 0.09461888670921326, "rewards/grpo_reward_func/mean": 0.29339537024497986, "rewards/grpo_reward_func/std": 0.11325549334287643, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.9375, "frac_reward_zero_std": 0.0, "grad_norm": 24.375, "kl": 0.0004989129301975481, "learning_rate": 3.824074074074074e-07, "loss": 0.0, "num_tokens": 4739696.0, "reward": 0.38193440437316895, "reward_std": 0.15044079720973969, "rewards/grpo_reward_func/mean": 0.38193440437316895, "rewards/grpo_reward_func/std": 0.1526176482439041, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.953125, "frac_reward_zero_std": 0.0, "grad_norm": 12.875, "kl": 0.0008048738091019914, "learning_rate": 3.8148148148148145e-07, "loss": 0.0, "num_tokens": 4765432.0, "reward": 0.10808064788579941, "reward_std": 0.14257347583770752, "rewards/grpo_reward_func/mean": 0.10808064788579941, "rewards/grpo_reward_func/std": 0.1668616235256195, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.96875, "frac_reward_zero_std": 0.0, "grad_norm": 10.125, "kl": 0.0006854971870779991, "learning_rate": 3.805555555555555e-07, "loss": 0.0, "num_tokens": 4790728.0, "reward": 0.3299209475517273, "reward_std": 0.12696070969104767, "rewards/grpo_reward_func/mean": 0.3299209475517273, "rewards/grpo_reward_func/std": 0.14549556374549866, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.984375, "frac_reward_zero_std": 0.0, "grad_norm": 12.4375, "kl": 0.0005391652957769111, "learning_rate": 3.7962962962962966e-07, "loss": 0.0, "num_tokens": 4815896.0, "reward": 0.24173803627490997, "reward_std": 0.13923460245132446, "rewards/grpo_reward_func/mean": 0.24173803627490997, "rewards/grpo_reward_func/std": 0.13477925956249237, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.0, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "kl": 0.0010796443675644696, "learning_rate": 3.787037037037037e-07, "loss": 0.0, "num_tokens": 4841448.0, "reward": 0.1647602617740631, "reward_std": 0.14355677366256714, "rewards/grpo_reward_func/mean": 0.1647602617740631, "rewards/grpo_reward_func/std": 0.15943202376365662, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.015625, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "kl": 0.00045456798397935927, "learning_rate": 3.7777777777777775e-07, "loss": 0.0, "num_tokens": 4866200.0, "reward": 0.34182441234588623, "reward_std": 0.1141589879989624, "rewards/grpo_reward_func/mean": 0.34182441234588623, "rewards/grpo_reward_func/std": 0.13174206018447876, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.03125, "frac_reward_zero_std": 0.0, "grad_norm": 7.59375, "kl": 0.0007237400859594345, "learning_rate": 3.768518518518518e-07, "loss": 0.0, "num_tokens": 4891192.0, "reward": 0.3446623384952545, "reward_std": 0.08384630084037781, "rewards/grpo_reward_func/mean": 0.3446623384952545, "rewards/grpo_reward_func/std": 0.08680541068315506, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.046875, "frac_reward_zero_std": 0.0, "grad_norm": 7.6875, "kl": 0.000518413566169329, "learning_rate": 3.759259259259259e-07, "loss": 0.0, "num_tokens": 4916504.0, "reward": 0.2807949185371399, "reward_std": 0.10580653697252274, "rewards/grpo_reward_func/mean": 0.2807949185371399, "rewards/grpo_reward_func/std": 0.19993965327739716, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.0625, "frac_reward_zero_std": 0.0, "grad_norm": 8.4375, "kl": 0.000430591702752281, "learning_rate": 3.75e-07, "loss": 0.0, "num_tokens": 4942040.0, "reward": 0.17394839227199554, "reward_std": 0.06569032371044159, "rewards/grpo_reward_func/mean": 0.17394839227199554, "rewards/grpo_reward_func/std": 0.19397369027137756, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.078125, "frac_reward_zero_std": 0.0, "grad_norm": 16.75, "kl": 0.00039455325168091804, "learning_rate": 3.7407407407407406e-07, "loss": 0.0, "num_tokens": 4967200.0, "reward": 0.4116261601448059, "reward_std": 0.18030327558517456, "rewards/grpo_reward_func/mean": 0.4116261601448059, "rewards/grpo_reward_func/std": 0.2310413271188736, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.09375, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.0005141782166901976, "learning_rate": 3.7314814814814816e-07, "loss": 0.0, "num_tokens": 4992016.0, "reward": 0.46965691447257996, "reward_std": 0.1314663141965866, "rewards/grpo_reward_func/mean": 0.46965691447257996, "rewards/grpo_reward_func/std": 0.17656759917736053, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.109375, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "kl": 0.0005590068249148317, "learning_rate": 3.722222222222222e-07, "loss": 0.0, "num_tokens": 5016520.0, "reward": 0.4341745972633362, "reward_std": 0.10189318656921387, "rewards/grpo_reward_func/mean": 0.4341745972633362, "rewards/grpo_reward_func/std": 0.21076862514019012, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.125, "frac_reward_zero_std": 0.0, "grad_norm": 9.25, "kl": 0.00043097294110339135, "learning_rate": 3.7129629629629626e-07, "loss": 0.0, "num_tokens": 5041600.0, "reward": 0.19751757383346558, "reward_std": 0.10619483888149261, "rewards/grpo_reward_func/mean": 0.19751757383346558, "rewards/grpo_reward_func/std": 0.19493362307548523, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.140625, "frac_reward_zero_std": 0.0, "grad_norm": 14.0, "kl": 0.0008377966587431729, "learning_rate": 3.703703703703703e-07, "loss": 0.0, "num_tokens": 5066544.0, "reward": 0.2634657025337219, "reward_std": 0.09939266741275787, "rewards/grpo_reward_func/mean": 0.2634657025337219, "rewards/grpo_reward_func/std": 0.11164335906505585, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.15625, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.0007731840014457703, "learning_rate": 3.6944444444444447e-07, "loss": 0.0, "num_tokens": 5092384.0, "reward": 0.09646777808666229, "reward_std": 0.06861913204193115, "rewards/grpo_reward_func/mean": 0.09646777808666229, "rewards/grpo_reward_func/std": 0.07161495089530945, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.171875, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "kl": 0.0002781056537060067, "learning_rate": 3.685185185185185e-07, "loss": 0.0, "num_tokens": 5117632.0, "reward": 0.3059152364730835, "reward_std": 0.15640440583229065, "rewards/grpo_reward_func/mean": 0.3059152364730835, "rewards/grpo_reward_func/std": 0.25642770528793335, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.1875, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "kl": 0.0006508687511086464, "learning_rate": 3.6759259259259257e-07, "loss": 0.0, "num_tokens": 5142432.0, "reward": 0.4996418356895447, "reward_std": 0.13480040431022644, "rewards/grpo_reward_func/mean": 0.4996418356895447, "rewards/grpo_reward_func/std": 0.14277315139770508, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.203125, "frac_reward_zero_std": 0.0, "grad_norm": 13.6875, "kl": 0.000795925036072731, "learning_rate": 3.666666666666666e-07, "loss": 0.0, "num_tokens": 5167304.0, "reward": 0.42619913816452026, "reward_std": 0.1849099099636078, "rewards/grpo_reward_func/mean": 0.42619913816452026, "rewards/grpo_reward_func/std": 0.18658678233623505, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.21875, "frac_reward_zero_std": 0.0, "grad_norm": 7.0625, "kl": 0.0011986760946456343, "learning_rate": 3.657407407407407e-07, "loss": 0.0, "num_tokens": 5192104.0, "reward": 0.4016791880130768, "reward_std": 0.07631438970565796, "rewards/grpo_reward_func/mean": 0.4016791880130768, "rewards/grpo_reward_func/std": 0.14636240899562836, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.234375, "frac_reward_zero_std": 0.0, "grad_norm": 13.125, "kl": 0.0007918803166830912, "learning_rate": 3.648148148148148e-07, "loss": 0.0, "num_tokens": 5217048.0, "reward": 0.3634570837020874, "reward_std": 0.13550926744937897, "rewards/grpo_reward_func/mean": 0.3634570837020874, "rewards/grpo_reward_func/std": 0.1402992159128189, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.25, "frac_reward_zero_std": 0.0, "grad_norm": 11.5625, "kl": 0.0005829473811900243, "learning_rate": 3.6388888888888887e-07, "loss": 0.0, "num_tokens": 5242296.0, "reward": 0.30222201347351074, "reward_std": 0.14429670572280884, "rewards/grpo_reward_func/mean": 0.30222201347351074, "rewards/grpo_reward_func/std": 0.15859085321426392, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.265625, "frac_reward_zero_std": 0.0, "grad_norm": 7.40625, "kl": 0.00040928709495346993, "learning_rate": 3.6296296296296297e-07, "loss": 0.0, "num_tokens": 5267656.0, "reward": 0.2286316603422165, "reward_std": 0.09120648354291916, "rewards/grpo_reward_func/mean": 0.2286316603422165, "rewards/grpo_reward_func/std": 0.21030573546886444, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.28125, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.0006142336205812171, "learning_rate": 3.62037037037037e-07, "loss": 0.0, "num_tokens": 5293176.0, "reward": 0.14809495210647583, "reward_std": 0.14708967506885529, "rewards/grpo_reward_func/mean": 0.14809495210647583, "rewards/grpo_reward_func/std": 0.16517038643360138, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.296875, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "kl": 0.0008529710030416027, "learning_rate": 3.6111111111111107e-07, "loss": 0.0, "num_tokens": 5318936.0, "reward": 0.17756229639053345, "reward_std": 0.058169350028038025, "rewards/grpo_reward_func/mean": 0.17756229639053345, "rewards/grpo_reward_func/std": 0.13397441804409027, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.3125, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "kl": 0.0003407594340387732, "learning_rate": 3.601851851851852e-07, "loss": 0.0, "num_tokens": 5344008.0, "reward": 0.36280357837677, "reward_std": 0.09298541396856308, "rewards/grpo_reward_func/mean": 0.36280357837677, "rewards/grpo_reward_func/std": 0.09538479149341583, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.328125, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.0005417931824922562, "learning_rate": 3.592592592592593e-07, "loss": 0.0, "num_tokens": 5368960.0, "reward": 0.47765880823135376, "reward_std": 0.1049264445900917, "rewards/grpo_reward_func/mean": 0.47765880823135376, "rewards/grpo_reward_func/std": 0.12036207318305969, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.9375, "completions/mean_terminated_length": 11.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.34375, "frac_reward_zero_std": 0.0, "grad_norm": 10.0, "kl": 0.0007974399486556649, "learning_rate": 3.583333333333333e-07, "loss": -0.005, "num_tokens": 5394727.0, "reward": 0.16735509037971497, "reward_std": 0.0997590720653534, "rewards/grpo_reward_func/mean": 0.16735509037971497, "rewards/grpo_reward_func/std": 0.12222032994031906, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.359375, "frac_reward_zero_std": 0.0, "grad_norm": 16.5, "kl": 0.0013814661651849747, "learning_rate": 3.574074074074074e-07, "loss": 0.0001, "num_tokens": 5419783.0, "reward": 0.3473682999610901, "reward_std": 0.08365271985530853, "rewards/grpo_reward_func/mean": 0.3473682999610901, "rewards/grpo_reward_func/std": 0.10378436744213104, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.375, "frac_reward_zero_std": 0.0, "grad_norm": 14.875, "kl": 0.000576346181333065, "learning_rate": 3.564814814814814e-07, "loss": 0.0, "num_tokens": 5445191.0, "reward": 0.3035712242126465, "reward_std": 0.1296510398387909, "rewards/grpo_reward_func/mean": 0.3035712242126465, "rewards/grpo_reward_func/std": 0.2325069159269333, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.390625, "frac_reward_zero_std": 0.0, "grad_norm": 5.4375, "kl": 0.0003792364223045297, "learning_rate": 3.5555555555555553e-07, "loss": 0.0, "num_tokens": 5471263.0, "reward": 0.08952207118272781, "reward_std": 0.060667045414447784, "rewards/grpo_reward_func/mean": 0.08952207118272781, "rewards/grpo_reward_func/std": 0.061015550047159195, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.40625, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.00032404749072156847, "learning_rate": 3.5462962962962963e-07, "loss": 0.0, "num_tokens": 5496703.0, "reward": 0.0992613434791565, "reward_std": 0.07703530788421631, "rewards/grpo_reward_func/mean": 0.0992613434791565, "rewards/grpo_reward_func/std": 0.145080104470253, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.421875, "frac_reward_zero_std": 0.0, "grad_norm": 12.3125, "kl": 0.0006244319229153916, "learning_rate": 3.537037037037037e-07, "loss": 0.0, "num_tokens": 5521919.0, "reward": 0.187003493309021, "reward_std": 0.11275693774223328, "rewards/grpo_reward_func/mean": 0.187003493309021, "rewards/grpo_reward_func/std": 0.1791585236787796, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.4375, "frac_reward_zero_std": 0.0, "grad_norm": 12.6875, "kl": 0.0013147607969585806, "learning_rate": 3.527777777777778e-07, "loss": 0.0001, "num_tokens": 5547479.0, "reward": 0.23747900128364563, "reward_std": 0.1416703164577484, "rewards/grpo_reward_func/mean": 0.23747900128364563, "rewards/grpo_reward_func/std": 0.26003557443618774, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.453125, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.001335889071924612, "learning_rate": 3.5185185185185183e-07, "loss": 0.0001, "num_tokens": 5573023.0, "reward": 0.3594636619091034, "reward_std": 0.10562098026275635, "rewards/grpo_reward_func/mean": 0.3594636619091034, "rewards/grpo_reward_func/std": 0.2661304175853729, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.46875, "frac_reward_zero_std": 0.0, "grad_norm": 15.25, "kl": 0.0004436932358657941, "learning_rate": 3.509259259259259e-07, "loss": 0.0, "num_tokens": 5598591.0, "reward": 0.15536442399024963, "reward_std": 0.09705634415149689, "rewards/grpo_reward_func/mean": 0.15536442399024963, "rewards/grpo_reward_func/std": 0.177720844745636, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.484375, "frac_reward_zero_std": 0.0, "grad_norm": 4.28125, "kl": 0.00038727434002794325, "learning_rate": 3.5e-07, "loss": 0.0, "num_tokens": 5623975.0, "reward": 0.2250569462776184, "reward_std": 0.043075259774923325, "rewards/grpo_reward_func/mean": 0.2250569462776184, "rewards/grpo_reward_func/std": 0.15976740419864655, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.5, "frac_reward_zero_std": 0.0, "grad_norm": 11.0, "kl": 0.0007129740115487948, "learning_rate": 3.490740740740741e-07, "loss": 0.0, "num_tokens": 5649511.0, "reward": 0.22658474743366241, "reward_std": 0.07546912878751755, "rewards/grpo_reward_func/mean": 0.22658474743366241, "rewards/grpo_reward_func/std": 0.18879064917564392, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.515625, "frac_reward_zero_std": 0.0, "grad_norm": 8.9375, "kl": 0.00010857979577849619, "learning_rate": 3.4814814814814814e-07, "loss": 0.0, "num_tokens": 5674447.0, "reward": 0.33255600929260254, "reward_std": 0.15443569421768188, "rewards/grpo_reward_func/mean": 0.33255600929260254, "rewards/grpo_reward_func/std": 0.1559605747461319, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.53125, "frac_reward_zero_std": 0.0, "grad_norm": 12.0625, "kl": 0.0010417526063974947, "learning_rate": 3.472222222222222e-07, "loss": 0.0, "num_tokens": 5699567.0, "reward": 0.30261072516441345, "reward_std": 0.06423477828502655, "rewards/grpo_reward_func/mean": 0.30261072516441345, "rewards/grpo_reward_func/std": 0.12586970627307892, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.546875, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "kl": 0.001145510614151135, "learning_rate": 3.4629629629629624e-07, "loss": 0.0, "num_tokens": 5725159.0, "reward": 0.19406265020370483, "reward_std": 0.10304485261440277, "rewards/grpo_reward_func/mean": 0.19406265020370483, "rewards/grpo_reward_func/std": 0.16005179286003113, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.5625, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "kl": 0.0006747040679329075, "learning_rate": 3.453703703703704e-07, "loss": 0.0, "num_tokens": 5750279.0, "reward": 0.2185659408569336, "reward_std": 0.08110688626766205, "rewards/grpo_reward_func/mean": 0.2185659408569336, "rewards/grpo_reward_func/std": 0.10881200432777405, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.578125, "frac_reward_zero_std": 0.0, "grad_norm": 6.625, "kl": 0.00036149504012428224, "learning_rate": 3.4444444444444444e-07, "loss": 0.0, "num_tokens": 5775775.0, "reward": 0.252541720867157, "reward_std": 0.14369598031044006, "rewards/grpo_reward_func/mean": 0.252541720867157, "rewards/grpo_reward_func/std": 0.2099451869726181, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.59375, "frac_reward_zero_std": 0.0, "grad_norm": 13.625, "kl": 0.001367637887597084, "learning_rate": 3.435185185185185e-07, "loss": 0.0001, "num_tokens": 5801039.0, "reward": 0.2519097328186035, "reward_std": 0.1605014055967331, "rewards/grpo_reward_func/mean": 0.2519097328186035, "rewards/grpo_reward_func/std": 0.26890748739242554, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.609375, "frac_reward_zero_std": 0.0, "grad_norm": 8.6875, "kl": 0.0006766791047994047, "learning_rate": 3.425925925925926e-07, "loss": 0.0, "num_tokens": 5825991.0, "reward": 0.3935144543647766, "reward_std": 0.12401822954416275, "rewards/grpo_reward_func/mean": 0.3935144543647766, "rewards/grpo_reward_func/std": 0.1281329244375229, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.625, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.0010292638908140361, "learning_rate": 3.4166666666666664e-07, "loss": 0.0, "num_tokens": 5850967.0, "reward": 0.3945986032485962, "reward_std": 0.0977005809545517, "rewards/grpo_reward_func/mean": 0.3945986032485962, "rewards/grpo_reward_func/std": 0.146220862865448, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.640625, "frac_reward_zero_std": 0.0, "grad_norm": 7.84375, "kl": 0.0007942042720969766, "learning_rate": 3.407407407407407e-07, "loss": 0.0, "num_tokens": 5876775.0, "reward": 0.1004338338971138, "reward_std": 0.12970568239688873, "rewards/grpo_reward_func/mean": 0.1004338338971138, "rewards/grpo_reward_func/std": 0.1417793482542038, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.65625, "frac_reward_zero_std": 0.0, "grad_norm": 23.375, "kl": 0.0014751525595784187, "learning_rate": 3.398148148148148e-07, "loss": 0.0001, "num_tokens": 5902023.0, "reward": 0.22899229824543, "reward_std": 0.10198648273944855, "rewards/grpo_reward_func/mean": 0.22899229824543, "rewards/grpo_reward_func/std": 0.13079826533794403, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.671875, "frac_reward_zero_std": 0.0, "grad_norm": 5.46875, "kl": 0.000608055226621218, "learning_rate": 3.388888888888889e-07, "loss": 0.0, "num_tokens": 5927263.0, "reward": 0.29941701889038086, "reward_std": 0.06719333678483963, "rewards/grpo_reward_func/mean": 0.29941701889038086, "rewards/grpo_reward_func/std": 0.14349378645420074, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.6875, "frac_reward_zero_std": 0.0, "grad_norm": 12.875, "kl": 0.0006101805192884058, "learning_rate": 3.3796296296296295e-07, "loss": 0.0, "num_tokens": 5951983.0, "reward": 0.4177182912826538, "reward_std": 0.15579620003700256, "rewards/grpo_reward_func/mean": 0.4177182912826538, "rewards/grpo_reward_func/std": 0.15117469429969788, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.703125, "frac_reward_zero_std": 0.0, "grad_norm": 14.9375, "kl": 0.0004981622769264504, "learning_rate": 3.37037037037037e-07, "loss": 0.0, "num_tokens": 5976487.0, "reward": 0.3275076150894165, "reward_std": 0.16276490688323975, "rewards/grpo_reward_func/mean": 0.3275076150894165, "rewards/grpo_reward_func/std": 0.1577589213848114, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.71875, "frac_reward_zero_std": 0.0, "grad_norm": 20.0, "kl": 0.0013415751745924354, "learning_rate": 3.361111111111111e-07, "loss": 0.0001, "num_tokens": 6001479.0, "reward": 0.37890833616256714, "reward_std": 0.18145695328712463, "rewards/grpo_reward_func/mean": 0.37890833616256714, "rewards/grpo_reward_func/std": 0.17795169353485107, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.734375, "frac_reward_zero_std": 0.0, "grad_norm": 13.0, "kl": 0.0011636121198534966, "learning_rate": 3.351851851851852e-07, "loss": 0.0, "num_tokens": 6027183.0, "reward": 0.1332564800977707, "reward_std": 0.224045991897583, "rewards/grpo_reward_func/mean": 0.1332564800977707, "rewards/grpo_reward_func/std": 0.21909579634666443, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.75, "frac_reward_zero_std": 0.0, "grad_norm": 13.8125, "kl": 0.0012435338867362589, "learning_rate": 3.3425925925925925e-07, "loss": 0.0, "num_tokens": 6052111.0, "reward": 0.27287042140960693, "reward_std": 0.15101364254951477, "rewards/grpo_reward_func/mean": 0.27287042140960693, "rewards/grpo_reward_func/std": 0.2304336130619049, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.765625, "frac_reward_zero_std": 0.0, "grad_norm": 15.25, "kl": 0.0005685510259354487, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 6076831.0, "reward": 0.47098228335380554, "reward_std": 0.08635647594928741, "rewards/grpo_reward_func/mean": 0.47098228335380554, "rewards/grpo_reward_func/std": 0.10293111950159073, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.78125, "frac_reward_zero_std": 0.0, "grad_norm": 5.78125, "kl": 0.0018707392737269402, "learning_rate": 3.324074074074074e-07, "loss": 0.0001, "num_tokens": 6102039.0, "reward": 0.36166447401046753, "reward_std": 0.06935366988182068, "rewards/grpo_reward_func/mean": 0.36166447401046753, "rewards/grpo_reward_func/std": 0.134328693151474, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.796875, "frac_reward_zero_std": 0.0, "grad_norm": 16.25, "kl": 0.0011540545820025727, "learning_rate": 3.3148148148148145e-07, "loss": 0.0, "num_tokens": 6126983.0, "reward": 0.3171887695789337, "reward_std": 0.08502347022294998, "rewards/grpo_reward_func/mean": 0.3171887695789337, "rewards/grpo_reward_func/std": 0.09260429441928864, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.8125, "frac_reward_zero_std": 0.0, "grad_norm": 11.1875, "kl": 0.0011697566660586745, "learning_rate": 3.3055555555555556e-07, "loss": 0.0, "num_tokens": 6152295.0, "reward": 0.2625499367713928, "reward_std": 0.09874355047941208, "rewards/grpo_reward_func/mean": 0.2625499367713928, "rewards/grpo_reward_func/std": 0.2084723711013794, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.828125, "frac_reward_zero_std": 0.0, "grad_norm": 5.125, "kl": 0.0007433524879161268, "learning_rate": 3.296296296296296e-07, "loss": 0.0, "num_tokens": 6177527.0, "reward": 0.27035290002822876, "reward_std": 0.04757823050022125, "rewards/grpo_reward_func/mean": 0.27035290002822876, "rewards/grpo_reward_func/std": 0.08030013740062714, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.84375, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "kl": 0.0008057684899540618, "learning_rate": 3.287037037037037e-07, "loss": 0.0, "num_tokens": 6202639.0, "reward": 0.3587920367717743, "reward_std": 0.16274358332157135, "rewards/grpo_reward_func/mean": 0.3587920367717743, "rewards/grpo_reward_func/std": 0.1588505208492279, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.859375, "frac_reward_zero_std": 0.0, "grad_norm": 9.125, "kl": 0.0006251692102523521, "learning_rate": 3.2777777777777776e-07, "loss": 0.0, "num_tokens": 6227983.0, "reward": 0.2578817307949066, "reward_std": 0.14186282455921173, "rewards/grpo_reward_func/mean": 0.2578817307949066, "rewards/grpo_reward_func/std": 0.23546750843524933, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.875, "frac_reward_zero_std": 0.0, "grad_norm": 14.6875, "kl": 0.000663579732645303, "learning_rate": 3.268518518518518e-07, "loss": 0.0, "num_tokens": 6254367.0, "reward": 0.0007088836282491684, "reward_std": 0.09014703333377838, "rewards/grpo_reward_func/mean": 0.0007088836282491684, "rewards/grpo_reward_func/std": 0.09519969671964645, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.890625, "frac_reward_zero_std": 0.0, "grad_norm": 9.0, "kl": 0.0010352494718972594, "learning_rate": 3.2592592592592596e-07, "loss": 0.0, "num_tokens": 6279463.0, "reward": 0.3615862727165222, "reward_std": 0.10009762644767761, "rewards/grpo_reward_func/mean": 0.3615862727165222, "rewards/grpo_reward_func/std": 0.108461894094944, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.90625, "frac_reward_zero_std": 0.0, "grad_norm": 4.6875, "kl": 0.0012366212613414973, "learning_rate": 3.25e-07, "loss": 0.0, "num_tokens": 6304359.0, "reward": 0.32284611463546753, "reward_std": 0.049088191241025925, "rewards/grpo_reward_func/mean": 0.32284611463546753, "rewards/grpo_reward_func/std": 0.07815742492675781, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.921875, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "kl": 0.0005682266055373475, "learning_rate": 3.2407407407407406e-07, "loss": 0.0, "num_tokens": 6329879.0, "reward": 0.1265973150730133, "reward_std": 0.10245135426521301, "rewards/grpo_reward_func/mean": 0.1265973150730133, "rewards/grpo_reward_func/std": 0.14798611402511597, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.9375, "frac_reward_zero_std": 0.0, "grad_norm": 6.34375, "kl": 0.0005744351219618693, "learning_rate": 3.231481481481481e-07, "loss": 0.0, "num_tokens": 6354855.0, "reward": 0.3396564722061157, "reward_std": 0.05385906249284744, "rewards/grpo_reward_func/mean": 0.3396564722061157, "rewards/grpo_reward_func/std": 0.052948247641325, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.953125, "frac_reward_zero_std": 0.0, "grad_norm": 11.3125, "kl": 0.0011231331154704094, "learning_rate": 3.222222222222222e-07, "loss": 0.0, "num_tokens": 6379831.0, "reward": 0.3173307776451111, "reward_std": 0.07542085647583008, "rewards/grpo_reward_func/mean": 0.3173307776451111, "rewards/grpo_reward_func/std": 0.10744292289018631, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.96875, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "kl": 0.0005484645516844466, "learning_rate": 3.2129629629629626e-07, "loss": 0.0, "num_tokens": 6404911.0, "reward": 0.3487330675125122, "reward_std": 0.1991029679775238, "rewards/grpo_reward_func/mean": 0.3487330675125122, "rewards/grpo_reward_func/std": 0.23129069805145264, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.984375, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.0005995733808958903, "learning_rate": 3.2037037037037037e-07, "loss": 0.0, "num_tokens": 6429919.0, "reward": 0.27099573612213135, "reward_std": 0.12892566621303558, "rewards/grpo_reward_func/mean": 0.27099573612213135, "rewards/grpo_reward_func/std": 0.1356169879436493, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.0, "frac_reward_zero_std": 0.0, "grad_norm": 9.0, "kl": 0.0009363433055114001, "learning_rate": 3.194444444444444e-07, "loss": 0.0, "num_tokens": 6455263.0, "reward": 0.2379104644060135, "reward_std": 0.13427025079727173, "rewards/grpo_reward_func/mean": 0.2379104644060135, "rewards/grpo_reward_func/std": 0.15128843486309052, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.015625, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "kl": 0.0011603829334490001, "learning_rate": 3.185185185185185e-07, "loss": 0.0, "num_tokens": 6480735.0, "reward": 0.2011098861694336, "reward_std": 0.1447315365076065, "rewards/grpo_reward_func/mean": 0.2011098861694336, "rewards/grpo_reward_func/std": 0.2299196422100067, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.03125, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.0019025284273084253, "learning_rate": 3.1759259259259257e-07, "loss": 0.0001, "num_tokens": 6505487.0, "reward": 0.38076773285865784, "reward_std": 0.062072522938251495, "rewards/grpo_reward_func/mean": 0.38076773285865784, "rewards/grpo_reward_func/std": 0.060071974992752075, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.046875, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.0004512484447332099, "learning_rate": 3.166666666666666e-07, "loss": 0.0, "num_tokens": 6530567.0, "reward": 0.24981309473514557, "reward_std": 0.1037866473197937, "rewards/grpo_reward_func/mean": 0.24981309473514557, "rewards/grpo_reward_func/std": 0.14956361055374146, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.0625, "frac_reward_zero_std": 0.0, "grad_norm": 8.625, "kl": 0.0010139914229512215, "learning_rate": 3.1574074074074077e-07, "loss": 0.0, "num_tokens": 6555831.0, "reward": 0.21904444694519043, "reward_std": 0.07102406024932861, "rewards/grpo_reward_func/mean": 0.21904444694519043, "rewards/grpo_reward_func/std": 0.17433929443359375, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.078125, "frac_reward_zero_std": 0.0, "grad_norm": 13.9375, "kl": 0.0008480849792249501, "learning_rate": 3.148148148148148e-07, "loss": 0.0, "num_tokens": 6580823.0, "reward": 0.3976954519748688, "reward_std": 0.14704205095767975, "rewards/grpo_reward_func/mean": 0.3976954519748688, "rewards/grpo_reward_func/std": 0.14226453006267548, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.09375, "frac_reward_zero_std": 0.0, "grad_norm": 13.4375, "kl": 0.0018108648364432156, "learning_rate": 3.1388888888888887e-07, "loss": 0.0001, "num_tokens": 6605631.0, "reward": 0.27959388494491577, "reward_std": 0.1226632297039032, "rewards/grpo_reward_func/mean": 0.27959388494491577, "rewards/grpo_reward_func/std": 0.13256776332855225, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.109375, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "kl": 0.0017761494382284582, "learning_rate": 3.129629629629629e-07, "loss": 0.0001, "num_tokens": 6630615.0, "reward": 0.3720259368419647, "reward_std": 0.08527237176895142, "rewards/grpo_reward_func/mean": 0.3720259368419647, "rewards/grpo_reward_func/std": 0.09316051751375198, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.125, "frac_reward_zero_std": 0.0, "grad_norm": 9.5625, "kl": 0.0005580664874287322, "learning_rate": 3.12037037037037e-07, "loss": 0.0, "num_tokens": 6656335.0, "reward": 0.120351143181324, "reward_std": 0.06724663823843002, "rewards/grpo_reward_func/mean": 0.120351143181324, "rewards/grpo_reward_func/std": 0.12132058292627335, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.140625, "frac_reward_zero_std": 0.0, "grad_norm": 10.0, "kl": 0.0005984306335449219, "learning_rate": 3.111111111111111e-07, "loss": 0.0, "num_tokens": 6681695.0, "reward": 0.21656858921051025, "reward_std": 0.10678647458553314, "rewards/grpo_reward_func/mean": 0.21656858921051025, "rewards/grpo_reward_func/std": 0.15582096576690674, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.15625, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "kl": 0.0001807762309908867, "learning_rate": 3.101851851851852e-07, "loss": 0.0, "num_tokens": 6707239.0, "reward": 0.213043212890625, "reward_std": 0.09147733449935913, "rewards/grpo_reward_func/mean": 0.213043212890625, "rewards/grpo_reward_func/std": 0.1813676506280899, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.171875, "frac_reward_zero_std": 0.0, "grad_norm": 12.75, "kl": 0.0008879285014700145, "learning_rate": 3.092592592592592e-07, "loss": 0.0, "num_tokens": 6732487.0, "reward": 0.222677081823349, "reward_std": 0.07209749519824982, "rewards/grpo_reward_func/mean": 0.222677081823349, "rewards/grpo_reward_func/std": 0.11454568058252335, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.1875, "frac_reward_zero_std": 0.0, "grad_norm": 10.4375, "kl": 0.0009467930940445513, "learning_rate": 3.0833333333333333e-07, "loss": 0.0, "num_tokens": 6758015.0, "reward": 0.25332149863243103, "reward_std": 0.12080815434455872, "rewards/grpo_reward_func/mean": 0.25332149863243103, "rewards/grpo_reward_func/std": 0.18324896693229675, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.203125, "frac_reward_zero_std": 0.0, "grad_norm": 15.4375, "kl": 0.0011847288988064975, "learning_rate": 3.074074074074074e-07, "loss": 0.0, "num_tokens": 6783375.0, "reward": 0.1880032867193222, "reward_std": 0.07594156265258789, "rewards/grpo_reward_func/mean": 0.1880032867193222, "rewards/grpo_reward_func/std": 0.14374983310699463, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.21875, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "kl": 0.0004900420753983781, "learning_rate": 3.0648148148148143e-07, "loss": 0.0, "num_tokens": 6808471.0, "reward": 0.3218265175819397, "reward_std": 0.07450239360332489, "rewards/grpo_reward_func/mean": 0.3218265175819397, "rewards/grpo_reward_func/std": 0.09696881473064423, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.234375, "frac_reward_zero_std": 0.0, "grad_norm": 13.5, "kl": 0.0003609297127695754, "learning_rate": 3.055555555555556e-07, "loss": 0.0, "num_tokens": 6833423.0, "reward": 0.4676928222179413, "reward_std": 0.11454892158508301, "rewards/grpo_reward_func/mean": 0.4676928222179413, "rewards/grpo_reward_func/std": 0.12146926671266556, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.25, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.000912386312847957, "learning_rate": 3.0462962962962963e-07, "loss": 0.0, "num_tokens": 6858519.0, "reward": 0.2522871196269989, "reward_std": 0.1315116584300995, "rewards/grpo_reward_func/mean": 0.2522871196269989, "rewards/grpo_reward_func/std": 0.18651296198368073, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.265625, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "kl": 0.0004488583654165268, "learning_rate": 3.037037037037037e-07, "loss": 0.0, "num_tokens": 6883855.0, "reward": 0.25175750255584717, "reward_std": 0.11373959481716156, "rewards/grpo_reward_func/mean": 0.25175750255584717, "rewards/grpo_reward_func/std": 0.18083377182483673, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.28125, "frac_reward_zero_std": 0.0, "grad_norm": 10.0, "kl": 0.0008468221349176019, "learning_rate": 3.0277777777777773e-07, "loss": 0.0, "num_tokens": 6909191.0, "reward": 0.28342726826667786, "reward_std": 0.16340333223342896, "rewards/grpo_reward_func/mean": 0.28342726826667786, "rewards/grpo_reward_func/std": 0.19860301911830902, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.296875, "frac_reward_zero_std": 0.0, "grad_norm": 16.25, "kl": 0.0004914179589832202, "learning_rate": 3.0185185185185183e-07, "loss": 0.0, "num_tokens": 6934383.0, "reward": 0.3115350604057312, "reward_std": 0.13271969556808472, "rewards/grpo_reward_func/mean": 0.3115350604057312, "rewards/grpo_reward_func/std": 0.14835800230503082, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.3125, "frac_reward_zero_std": 0.0, "grad_norm": 6.03125, "kl": 0.0005301498022163287, "learning_rate": 3.0092592592592594e-07, "loss": 0.0, "num_tokens": 6959679.0, "reward": 0.22619304060935974, "reward_std": 0.052596937865018845, "rewards/grpo_reward_func/mean": 0.22619304060935974, "rewards/grpo_reward_func/std": 0.06435148417949677, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.328125, "frac_reward_zero_std": 0.0, "grad_norm": 7.5, "kl": 0.0004184702556813136, "learning_rate": 3e-07, "loss": 0.0, "num_tokens": 6984911.0, "reward": 0.2275838553905487, "reward_std": 0.07728221267461777, "rewards/grpo_reward_func/mean": 0.2275838553905487, "rewards/grpo_reward_func/std": 0.11855830997228622, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.34375, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "kl": 0.0025757864059414715, "learning_rate": 2.9907407407407404e-07, "loss": 0.0001, "num_tokens": 7009823.0, "reward": 0.37820249795913696, "reward_std": 0.10672685503959656, "rewards/grpo_reward_func/mean": 0.37820249795913696, "rewards/grpo_reward_func/std": 0.1123933345079422, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.359375, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "kl": 0.0011405842669773847, "learning_rate": 2.9814814814814814e-07, "loss": 0.0, "num_tokens": 7035151.0, "reward": 0.0609009750187397, "reward_std": 0.05970199033617973, "rewards/grpo_reward_func/mean": 0.0609009750187397, "rewards/grpo_reward_func/std": 0.0622292160987854, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.375, "frac_reward_zero_std": 0.0, "grad_norm": 8.1875, "kl": 0.0007134604675229639, "learning_rate": 2.972222222222222e-07, "loss": 0.0, "num_tokens": 7060359.0, "reward": 0.28891313076019287, "reward_std": 0.12801918387413025, "rewards/grpo_reward_func/mean": 0.28891313076019287, "rewards/grpo_reward_func/std": 0.13185609877109528, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.390625, "frac_reward_zero_std": 0.0, "grad_norm": 9.25, "kl": 0.0006706975400447845, "learning_rate": 2.962962962962963e-07, "loss": 0.0, "num_tokens": 7085407.0, "reward": 0.27223989367485046, "reward_std": 0.07281184196472168, "rewards/grpo_reward_func/mean": 0.27223989367485046, "rewards/grpo_reward_func/std": 0.07228488475084305, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.40625, "frac_reward_zero_std": 0.0, "grad_norm": 15.6875, "kl": 0.0007390764949377626, "learning_rate": 2.953703703703704e-07, "loss": 0.0, "num_tokens": 7110727.0, "reward": 0.3155236542224884, "reward_std": 0.08076095581054688, "rewards/grpo_reward_func/mean": 0.3155236542224884, "rewards/grpo_reward_func/std": 0.1124795451760292, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.421875, "frac_reward_zero_std": 0.0, "grad_norm": 11.5625, "kl": 0.0005147006886545569, "learning_rate": 2.9444444444444444e-07, "loss": 0.0, "num_tokens": 7135567.0, "reward": 0.3611481189727783, "reward_std": 0.14192907512187958, "rewards/grpo_reward_func/mean": 0.3611481189727783, "rewards/grpo_reward_func/std": 0.17185887694358826, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.4375, "frac_reward_zero_std": 0.0, "grad_norm": 9.375, "kl": 0.0007455420272890478, "learning_rate": 2.935185185185185e-07, "loss": 0.0, "num_tokens": 7160631.0, "reward": 0.2531818449497223, "reward_std": 0.11840977519750595, "rewards/grpo_reward_func/mean": 0.2531818449497223, "rewards/grpo_reward_func/std": 0.14898955821990967, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.453125, "frac_reward_zero_std": 0.0, "grad_norm": 8.9375, "kl": 0.0006515182030852884, "learning_rate": 2.9259259259259254e-07, "loss": 0.0, "num_tokens": 7185359.0, "reward": 0.4088771343231201, "reward_std": 0.09822411835193634, "rewards/grpo_reward_func/mean": 0.4088771343231201, "rewards/grpo_reward_func/std": 0.12453342974185944, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.46875, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "kl": 0.0017640814476180822, "learning_rate": 2.916666666666667e-07, "loss": 0.0001, "num_tokens": 7211087.0, "reward": 0.21529509127140045, "reward_std": 0.14051379263401031, "rewards/grpo_reward_func/mean": 0.21529509127140045, "rewards/grpo_reward_func/std": 0.22462895512580872, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.484375, "frac_reward_zero_std": 0.0, "grad_norm": 14.0625, "kl": 0.0016883965581655502, "learning_rate": 2.9074074074074075e-07, "loss": 0.0001, "num_tokens": 7236343.0, "reward": 0.19085359573364258, "reward_std": 0.07424027472734451, "rewards/grpo_reward_func/mean": 0.19085359573364258, "rewards/grpo_reward_func/std": 0.14520986378192902, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.5, "frac_reward_zero_std": 0.0, "grad_norm": 12.6875, "kl": 0.0005011484026908875, "learning_rate": 2.898148148148148e-07, "loss": 0.0, "num_tokens": 7261687.0, "reward": 0.23781077563762665, "reward_std": 0.1257193684577942, "rewards/grpo_reward_func/mean": 0.23781077563762665, "rewards/grpo_reward_func/std": 0.2135416567325592, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.515625, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.001285669393837452, "learning_rate": 2.8888888888888885e-07, "loss": 0.0001, "num_tokens": 7286671.0, "reward": 0.42376312613487244, "reward_std": 0.1314304769039154, "rewards/grpo_reward_func/mean": 0.42376312613487244, "rewards/grpo_reward_func/std": 0.13118663430213928, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.53125, "frac_reward_zero_std": 0.0, "grad_norm": 11.9375, "kl": 0.0010273307852912694, "learning_rate": 2.8796296296296295e-07, "loss": 0.0, "num_tokens": 7312031.0, "reward": 0.2674727439880371, "reward_std": 0.10928401350975037, "rewards/grpo_reward_func/mean": 0.2674727439880371, "rewards/grpo_reward_func/std": 0.2467528134584427, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.546875, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "kl": 0.0015598470345139503, "learning_rate": 2.87037037037037e-07, "loss": 0.0001, "num_tokens": 7337039.0, "reward": 0.355027437210083, "reward_std": 0.1009407714009285, "rewards/grpo_reward_func/mean": 0.355027437210083, "rewards/grpo_reward_func/std": 0.10701252520084381, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.5625, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.0006988458335399628, "learning_rate": 2.861111111111111e-07, "loss": 0.0, "num_tokens": 7362615.0, "reward": 0.3158547878265381, "reward_std": 0.10899890214204788, "rewards/grpo_reward_func/mean": 0.3158547878265381, "rewards/grpo_reward_func/std": 0.24725650250911713, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.578125, "frac_reward_zero_std": 0.0, "grad_norm": 12.0, "kl": 0.00035879015194950625, "learning_rate": 2.851851851851852e-07, "loss": 0.0, "num_tokens": 7387551.0, "reward": 0.4256178140640259, "reward_std": 0.11733870208263397, "rewards/grpo_reward_func/mean": 0.4256178140640259, "rewards/grpo_reward_func/std": 0.11450333893299103, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.59375, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "kl": 0.0016969367861747742, "learning_rate": 2.8425925925925925e-07, "loss": 0.0001, "num_tokens": 7412687.0, "reward": 0.29177987575531006, "reward_std": 0.09220882505178452, "rewards/grpo_reward_func/mean": 0.29177987575531006, "rewards/grpo_reward_func/std": 0.2229277640581131, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.609375, "frac_reward_zero_std": 0.0, "grad_norm": 10.75, "kl": 0.0008428776636719704, "learning_rate": 2.833333333333333e-07, "loss": 0.0, "num_tokens": 7438223.0, "reward": 0.24013623595237732, "reward_std": 0.07828111946582794, "rewards/grpo_reward_func/mean": 0.24013623595237732, "rewards/grpo_reward_func/std": 0.20988282561302185, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.625, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "kl": 0.0015230309218168259, "learning_rate": 2.8240740740740735e-07, "loss": 0.0001, "num_tokens": 7463023.0, "reward": 0.2694811224937439, "reward_std": 0.1517883688211441, "rewards/grpo_reward_func/mean": 0.2694811224937439, "rewards/grpo_reward_func/std": 0.1755567044019699, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.640625, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "kl": 0.0009139720350503922, "learning_rate": 2.814814814814815e-07, "loss": 0.0, "num_tokens": 7488479.0, "reward": 0.21519726514816284, "reward_std": 0.12513023614883423, "rewards/grpo_reward_func/mean": 0.21519726514816284, "rewards/grpo_reward_func/std": 0.15375806391239166, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.65625, "frac_reward_zero_std": 0.0, "grad_norm": 7.28125, "kl": 0.00033611089747864753, "learning_rate": 2.8055555555555556e-07, "loss": 0.0, "num_tokens": 7513439.0, "reward": 0.29541122913360596, "reward_std": 0.06453363597393036, "rewards/grpo_reward_func/mean": 0.29541122913360596, "rewards/grpo_reward_func/std": 0.07813195884227753, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.671875, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "kl": 0.001329958438873291, "learning_rate": 2.796296296296296e-07, "loss": 0.0001, "num_tokens": 7538879.0, "reward": 0.19282189011573792, "reward_std": 0.15989510715007782, "rewards/grpo_reward_func/mean": 0.19282189011573792, "rewards/grpo_reward_func/std": 0.1697288304567337, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.6875, "frac_reward_zero_std": 0.0, "grad_norm": 15.5, "kl": 0.0011378759809304029, "learning_rate": 2.787037037037037e-07, "loss": 0.0, "num_tokens": 7563959.0, "reward": 0.4194362163543701, "reward_std": 0.1440151333808899, "rewards/grpo_reward_func/mean": 0.4194362163543701, "rewards/grpo_reward_func/std": 0.23242692649364471, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.703125, "frac_reward_zero_std": 0.0, "grad_norm": 9.8125, "kl": 0.0009807453607209027, "learning_rate": 2.7777777777777776e-07, "loss": 0.0, "num_tokens": 7589055.0, "reward": 0.13554009795188904, "reward_std": 0.06755845993757248, "rewards/grpo_reward_func/mean": 0.13554009795188904, "rewards/grpo_reward_func/std": 0.18186631798744202, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.71875, "frac_reward_zero_std": 0.0, "grad_norm": 8.3125, "kl": 0.0010332918318454176, "learning_rate": 2.7685185185185186e-07, "loss": 0.0, "num_tokens": 7614335.0, "reward": 0.24098092317581177, "reward_std": 0.0867777168750763, "rewards/grpo_reward_func/mean": 0.24098092317581177, "rewards/grpo_reward_func/std": 0.19634543359279633, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.734375, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.0010067789407912642, "learning_rate": 2.759259259259259e-07, "loss": 0.0, "num_tokens": 7639447.0, "reward": 0.2790209650993347, "reward_std": 0.11601302027702332, "rewards/grpo_reward_func/mean": 0.2790209650993347, "rewards/grpo_reward_func/std": 0.11709357798099518, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.75, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "kl": 0.0010244656878057867, "learning_rate": 2.75e-07, "loss": 0.0, "num_tokens": 7664423.0, "reward": 0.400749534368515, "reward_std": 0.08898760378360748, "rewards/grpo_reward_func/mean": 0.400749534368515, "rewards/grpo_reward_func/std": 0.19838882982730865, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.765625, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "kl": 0.001470650837291032, "learning_rate": 2.7407407407407406e-07, "loss": 0.0001, "num_tokens": 7689567.0, "reward": 0.2512458562850952, "reward_std": 0.12142281234264374, "rewards/grpo_reward_func/mean": 0.2512458562850952, "rewards/grpo_reward_func/std": 0.1289859265089035, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.78125, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "kl": 0.0010143139807041734, "learning_rate": 2.731481481481481e-07, "loss": 0.0, "num_tokens": 7715719.0, "reward": 0.07891548424959183, "reward_std": 0.07104109227657318, "rewards/grpo_reward_func/mean": 0.07891548424959183, "rewards/grpo_reward_func/std": 0.07307452708482742, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.796875, "frac_reward_zero_std": 0.0, "grad_norm": 16.625, "kl": 0.001021136820781976, "learning_rate": 2.7222222222222216e-07, "loss": 0.0, "num_tokens": 7741599.0, "reward": 0.08877018094062805, "reward_std": 0.10754196345806122, "rewards/grpo_reward_func/mean": 0.08877018094062805, "rewards/grpo_reward_func/std": 0.11736486107110977, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.8125, "frac_reward_zero_std": 0.0, "grad_norm": 13.875, "kl": 0.0023402251536026597, "learning_rate": 2.712962962962963e-07, "loss": 0.0001, "num_tokens": 7766439.0, "reward": 0.3745066523551941, "reward_std": 0.10392110794782639, "rewards/grpo_reward_func/mean": 0.3745066523551941, "rewards/grpo_reward_func/std": 0.1060907319188118, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.828125, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "kl": 0.0014679792220704257, "learning_rate": 2.7037037037037037e-07, "loss": 0.0001, "num_tokens": 7792415.0, "reward": 0.05516662448644638, "reward_std": 0.1237877607345581, "rewards/grpo_reward_func/mean": 0.05516662448644638, "rewards/grpo_reward_func/std": 0.14112551510334015, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.84375, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "kl": 0.0011287930537946522, "learning_rate": 2.694444444444444e-07, "loss": 0.0, "num_tokens": 7817583.0, "reward": 0.2898673713207245, "reward_std": 0.14550068974494934, "rewards/grpo_reward_func/mean": 0.2898673713207245, "rewards/grpo_reward_func/std": 0.16415542364120483, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.859375, "frac_reward_zero_std": 0.0, "grad_norm": 11.8125, "kl": 0.0006401048449333757, "learning_rate": 2.685185185185185e-07, "loss": 0.0, "num_tokens": 7842599.0, "reward": 0.3488427698612213, "reward_std": 0.17175181210041046, "rewards/grpo_reward_func/mean": 0.3488427698612213, "rewards/grpo_reward_func/std": 0.21280032396316528, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.875, "frac_reward_zero_std": 0.0, "grad_norm": 14.9375, "kl": 0.0005905106663703918, "learning_rate": 2.6759259259259257e-07, "loss": 0.0, "num_tokens": 7868007.0, "reward": 0.3986209034919739, "reward_std": 0.15302179753780365, "rewards/grpo_reward_func/mean": 0.3986209034919739, "rewards/grpo_reward_func/std": 0.22395354509353638, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.890625, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.0009062414173968136, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "num_tokens": 7893263.0, "reward": 0.224882572889328, "reward_std": 0.11343192309141159, "rewards/grpo_reward_func/mean": 0.224882572889328, "rewards/grpo_reward_func/std": 0.187125563621521, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.90625, "frac_reward_zero_std": 0.0, "grad_norm": 15.3125, "kl": 0.0009414016676601022, "learning_rate": 2.657407407407407e-07, "loss": 0.0, "num_tokens": 7918671.0, "reward": 0.24882347881793976, "reward_std": 0.135990172624588, "rewards/grpo_reward_func/mean": 0.24882347881793976, "rewards/grpo_reward_func/std": 0.24125628173351288, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.921875, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "kl": 0.0013116998597979546, "learning_rate": 2.648148148148148e-07, "loss": 0.0001, "num_tokens": 7944103.0, "reward": 0.2479252815246582, "reward_std": 0.14946752786636353, "rewards/grpo_reward_func/mean": 0.2479252815246582, "rewards/grpo_reward_func/std": 0.2719341218471527, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.9375, "frac_reward_zero_std": 0.0, "grad_norm": 11.125, "kl": 0.0005858497024746612, "learning_rate": 2.638888888888889e-07, "loss": 0.0, "num_tokens": 7969271.0, "reward": 0.41760867834091187, "reward_std": 0.1600833237171173, "rewards/grpo_reward_func/mean": 0.41760867834091187, "rewards/grpo_reward_func/std": 0.19628752768039703, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.953125, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "kl": 0.0008220685122068971, "learning_rate": 2.629629629629629e-07, "loss": 0.0, "num_tokens": 7993999.0, "reward": 0.35043102502822876, "reward_std": 0.09396857023239136, "rewards/grpo_reward_func/mean": 0.35043102502822876, "rewards/grpo_reward_func/std": 0.1028953343629837, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.96875, "frac_reward_zero_std": 0.0, "grad_norm": 9.125, "kl": 0.0007283634913619608, "learning_rate": 2.62037037037037e-07, "loss": 0.0, "num_tokens": 8019175.0, "reward": 0.4264575242996216, "reward_std": 0.07613471150398254, "rewards/grpo_reward_func/mean": 0.4264575242996216, "rewards/grpo_reward_func/std": 0.23587268590927124, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.984375, "frac_reward_zero_std": 0.0, "grad_norm": 14.3125, "kl": 0.001260551915038377, "learning_rate": 2.6111111111111113e-07, "loss": 0.0001, "num_tokens": 8044359.0, "reward": 0.3039223551750183, "reward_std": 0.1887253224849701, "rewards/grpo_reward_func/mean": 0.3039223551750183, "rewards/grpo_reward_func/std": 0.24560463428497314, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.0, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "kl": 0.0006503397598862648, "learning_rate": 2.601851851851852e-07, "loss": 0.0, "num_tokens": 8069079.0, "reward": 0.47094419598579407, "reward_std": 0.1439197063446045, "rewards/grpo_reward_func/mean": 0.47094419598579407, "rewards/grpo_reward_func/std": 0.1566508263349533, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.015625, "frac_reward_zero_std": 0.0, "grad_norm": 7.5, "kl": 0.0005181074739084579, "learning_rate": 2.5925925925925923e-07, "loss": 0.0, "num_tokens": 8094695.0, "reward": 0.21605555713176727, "reward_std": 0.04700346663594246, "rewards/grpo_reward_func/mean": 0.21605555713176727, "rewards/grpo_reward_func/std": 0.16210661828517914, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.03125, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.000648934073979035, "learning_rate": 2.5833333333333333e-07, "loss": 0.0, "num_tokens": 8119727.0, "reward": 0.38052335381507874, "reward_std": 0.15309563279151917, "rewards/grpo_reward_func/mean": 0.38052335381507874, "rewards/grpo_reward_func/std": 0.1593683809041977, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.046875, "frac_reward_zero_std": 0.0, "grad_norm": 7.6875, "kl": 0.0004919447528664023, "learning_rate": 2.574074074074074e-07, "loss": 0.0, "num_tokens": 8144823.0, "reward": 0.2545957565307617, "reward_std": 0.09698673337697983, "rewards/grpo_reward_func/mean": 0.2545957565307617, "rewards/grpo_reward_func/std": 0.14691407978534698, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.0625, "frac_reward_zero_std": 0.0, "grad_norm": 9.0, "kl": 0.0005518794059753418, "learning_rate": 2.564814814814815e-07, "loss": 0.0, "num_tokens": 8170231.0, "reward": 0.2633800804615021, "reward_std": 0.1403658092021942, "rewards/grpo_reward_func/mean": 0.2633800804615021, "rewards/grpo_reward_func/std": 0.20393338799476624, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.078125, "frac_reward_zero_std": 0.0, "grad_norm": 15.0625, "kl": 0.0013933554291725159, "learning_rate": 2.5555555555555553e-07, "loss": 0.0001, "num_tokens": 8195255.0, "reward": 0.4053245782852173, "reward_std": 0.13726326823234558, "rewards/grpo_reward_func/mean": 0.4053245782852173, "rewards/grpo_reward_func/std": 0.23320239782333374, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.09375, "frac_reward_zero_std": 0.0, "grad_norm": 25.625, "kl": 0.002304654335603118, "learning_rate": 2.5462962962962963e-07, "loss": 0.0001, "num_tokens": 8220567.0, "reward": 0.28461790084838867, "reward_std": 0.2090614289045334, "rewards/grpo_reward_func/mean": 0.28461790084838867, "rewards/grpo_reward_func/std": 0.253989040851593, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.109375, "frac_reward_zero_std": 0.0, "grad_norm": 13.5, "kl": 0.0005805188266094774, "learning_rate": 2.537037037037037e-07, "loss": 0.0, "num_tokens": 8245655.0, "reward": 0.281326562166214, "reward_std": 0.10936335474252701, "rewards/grpo_reward_func/mean": 0.281326562166214, "rewards/grpo_reward_func/std": 0.13177289068698883, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.125, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "kl": 0.0003259473742218688, "learning_rate": 2.5277777777777773e-07, "loss": 0.0, "num_tokens": 8270591.0, "reward": 0.38816237449645996, "reward_std": 0.13367968797683716, "rewards/grpo_reward_func/mean": 0.38816237449645996, "rewards/grpo_reward_func/std": 0.17273396253585815, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.140625, "frac_reward_zero_std": 0.0, "grad_norm": 12.375, "kl": 0.0018250771681778133, "learning_rate": 2.5185185185185184e-07, "loss": 0.0001, "num_tokens": 8295767.0, "reward": 0.36079350113868713, "reward_std": 0.11437784135341644, "rewards/grpo_reward_func/mean": 0.36079350113868713, "rewards/grpo_reward_func/std": 0.21398717164993286, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.15625, "frac_reward_zero_std": 0.0, "grad_norm": 9.3125, "kl": 0.002108390093781054, "learning_rate": 2.5092592592592594e-07, "loss": 0.0001, "num_tokens": 8320431.0, "reward": 0.3810342848300934, "reward_std": 0.08449837565422058, "rewards/grpo_reward_func/mean": 0.3810342848300934, "rewards/grpo_reward_func/std": 0.0880691260099411, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.171875, "frac_reward_zero_std": 0.0, "grad_norm": 13.5625, "kl": 0.0014629397774115205, "learning_rate": 2.5e-07, "loss": 0.0001, "num_tokens": 8345951.0, "reward": 0.2561107873916626, "reward_std": 0.10452878475189209, "rewards/grpo_reward_func/mean": 0.2561107873916626, "rewards/grpo_reward_func/std": 0.13631132245063782, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.1875, "frac_reward_zero_std": 0.0, "grad_norm": 13.6875, "kl": 0.0007190862525021657, "learning_rate": 2.490740740740741e-07, "loss": 0.0, "num_tokens": 8371799.0, "reward": 0.05727135390043259, "reward_std": 0.08457481861114502, "rewards/grpo_reward_func/mean": 0.05727135390043259, "rewards/grpo_reward_func/std": 0.08320802450180054, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.203125, "frac_reward_zero_std": 0.0, "grad_norm": 15.3125, "kl": 0.0010623404232319444, "learning_rate": 2.4814814814814814e-07, "loss": 0.0, "num_tokens": 8396655.0, "reward": 0.47699636220932007, "reward_std": 0.1573687195777893, "rewards/grpo_reward_func/mean": 0.47699636220932007, "rewards/grpo_reward_func/std": 0.16918563842773438, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.21875, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "kl": 0.0008317362517118454, "learning_rate": 2.4722222222222224e-07, "loss": 0.0, "num_tokens": 8422231.0, "reward": 0.2612965404987335, "reward_std": 0.10427425801753998, "rewards/grpo_reward_func/mean": 0.2612965404987335, "rewards/grpo_reward_func/std": 0.25590068101882935, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.234375, "frac_reward_zero_std": 0.0, "grad_norm": 12.0625, "kl": 0.0012046831543557346, "learning_rate": 2.462962962962963e-07, "loss": 0.0, "num_tokens": 8447175.0, "reward": 0.4008851647377014, "reward_std": 0.11680196225643158, "rewards/grpo_reward_func/mean": 0.4008851647377014, "rewards/grpo_reward_func/std": 0.13369765877723694, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.25, "frac_reward_zero_std": 0.0, "grad_norm": 16.0, "kl": 0.0006340428517432883, "learning_rate": 2.4537037037037034e-07, "loss": 0.0, "num_tokens": 8472255.0, "reward": 0.26478323340415955, "reward_std": 0.16724863648414612, "rewards/grpo_reward_func/mean": 0.26478323340415955, "rewards/grpo_reward_func/std": 0.1973247230052948, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.265625, "frac_reward_zero_std": 0.0, "grad_norm": 14.75, "kl": 0.0011970326595474035, "learning_rate": 2.4444444444444445e-07, "loss": 0.0, "num_tokens": 8497263.0, "reward": 0.374372661113739, "reward_std": 0.08080107718706131, "rewards/grpo_reward_func/mean": 0.374372661113739, "rewards/grpo_reward_func/std": 0.08129201829433441, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.28125, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.0009607278334442526, "learning_rate": 2.435185185185185e-07, "loss": 0.0, "num_tokens": 8522511.0, "reward": 0.24800701439380646, "reward_std": 0.12350637465715408, "rewards/grpo_reward_func/mean": 0.24800701439380646, "rewards/grpo_reward_func/std": 0.18218368291854858, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.296875, "frac_reward_zero_std": 0.0, "grad_norm": 6.9375, "kl": 0.0007166365685407072, "learning_rate": 2.425925925925926e-07, "loss": 0.0, "num_tokens": 8547223.0, "reward": 0.388899564743042, "reward_std": 0.06452769041061401, "rewards/grpo_reward_func/mean": 0.388899564743042, "rewards/grpo_reward_func/std": 0.08408286422491074, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.3125, "frac_reward_zero_std": 0.0, "grad_norm": 14.6875, "kl": 0.0006827044708188623, "learning_rate": 2.4166666666666665e-07, "loss": 0.0, "num_tokens": 8572143.0, "reward": 0.3447013795375824, "reward_std": 0.10072646290063858, "rewards/grpo_reward_func/mean": 0.3447013795375824, "rewards/grpo_reward_func/std": 0.1320715993642807, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.328125, "frac_reward_zero_std": 0.0, "grad_norm": 10.875, "kl": 0.0012708511494565755, "learning_rate": 2.407407407407407e-07, "loss": 0.0001, "num_tokens": 8597503.0, "reward": 0.33516252040863037, "reward_std": 0.2073679268360138, "rewards/grpo_reward_func/mean": 0.33516252040863037, "rewards/grpo_reward_func/std": 0.23557628691196442, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.34375, "frac_reward_zero_std": 0.0, "grad_norm": 11.1875, "kl": 0.0013067865220364183, "learning_rate": 2.398148148148148e-07, "loss": 0.0001, "num_tokens": 8623423.0, "reward": 0.06080477684736252, "reward_std": 0.08706031739711761, "rewards/grpo_reward_func/mean": 0.06080477684736252, "rewards/grpo_reward_func/std": 0.08990643173456192, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.359375, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "kl": 0.000486970558995381, "learning_rate": 2.388888888888889e-07, "loss": 0.0, "num_tokens": 8648583.0, "reward": 0.4393516182899475, "reward_std": 0.13304097950458527, "rewards/grpo_reward_func/mean": 0.4393516182899475, "rewards/grpo_reward_func/std": 0.13269037008285522, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.375, "frac_reward_zero_std": 0.0, "grad_norm": 7.9375, "kl": 0.0003986669034929946, "learning_rate": 2.3796296296296295e-07, "loss": 0.0, "num_tokens": 8674007.0, "reward": 0.1185535416007042, "reward_std": 0.04521109163761139, "rewards/grpo_reward_func/mean": 0.1185535416007042, "rewards/grpo_reward_func/std": 0.10313326120376587, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.390625, "frac_reward_zero_std": 0.0, "grad_norm": 9.9375, "kl": 0.0008137710246955976, "learning_rate": 2.3703703703703703e-07, "loss": 0.0, "num_tokens": 8699695.0, "reward": 0.22900578379631042, "reward_std": 0.1365535855293274, "rewards/grpo_reward_func/mean": 0.22900578379631042, "rewards/grpo_reward_func/std": 0.18792560696601868, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.40625, "frac_reward_zero_std": 0.0, "grad_norm": 13.0, "kl": 0.0009939819865394384, "learning_rate": 2.361111111111111e-07, "loss": 0.0, "num_tokens": 8724831.0, "reward": 0.29041871428489685, "reward_std": 0.08149899542331696, "rewards/grpo_reward_func/mean": 0.29041871428489685, "rewards/grpo_reward_func/std": 0.219549298286438, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.421875, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.0010270678030792624, "learning_rate": 2.3518518518518518e-07, "loss": 0.0, "num_tokens": 8749951.0, "reward": 0.19644547998905182, "reward_std": 0.11144804954528809, "rewards/grpo_reward_func/mean": 0.19644547998905182, "rewards/grpo_reward_func/std": 0.15927433967590332, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.4375, "frac_reward_zero_std": 0.0, "grad_norm": 14.4375, "kl": 0.0012654773890972137, "learning_rate": 2.3425925925925923e-07, "loss": 0.0001, "num_tokens": 8774775.0, "reward": 0.2843724489212036, "reward_std": 0.099556565284729, "rewards/grpo_reward_func/mean": 0.2843724489212036, "rewards/grpo_reward_func/std": 0.10954099893569946, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.453125, "frac_reward_zero_std": 0.0, "grad_norm": 12.3125, "kl": 0.0010531196894589812, "learning_rate": 2.3333333333333333e-07, "loss": 0.0, "num_tokens": 8799855.0, "reward": 0.37109872698783875, "reward_std": 0.09316375851631165, "rewards/grpo_reward_func/mean": 0.37109872698783875, "rewards/grpo_reward_func/std": 0.0997348204255104, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.46875, "frac_reward_zero_std": 0.0, "grad_norm": 14.0, "kl": 0.001134138583438471, "learning_rate": 2.3240740740740738e-07, "loss": 0.0, "num_tokens": 8825103.0, "reward": 0.2134397327899933, "reward_std": 0.09246792644262314, "rewards/grpo_reward_func/mean": 0.2134397327899933, "rewards/grpo_reward_func/std": 0.15889793634414673, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.484375, "frac_reward_zero_std": 0.0, "grad_norm": 13.4375, "kl": 0.0015808992902748287, "learning_rate": 2.3148148148148148e-07, "loss": 0.0001, "num_tokens": 8849727.0, "reward": 0.36531519889831543, "reward_std": 0.14355136454105377, "rewards/grpo_reward_func/mean": 0.36531519889831543, "rewards/grpo_reward_func/std": 0.1389627456665039, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.5, "frac_reward_zero_std": 0.0, "grad_norm": 15.1875, "kl": 0.0006994760187808424, "learning_rate": 2.3055555555555556e-07, "loss": 0.0, "num_tokens": 8874639.0, "reward": 0.40682148933410645, "reward_std": 0.13052034378051758, "rewards/grpo_reward_func/mean": 0.40682148933410645, "rewards/grpo_reward_func/std": 0.1289222538471222, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.515625, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.0006646249967161566, "learning_rate": 2.296296296296296e-07, "loss": 0.0, "num_tokens": 8899863.0, "reward": 0.2974233031272888, "reward_std": 0.15527892112731934, "rewards/grpo_reward_func/mean": 0.2974233031272888, "rewards/grpo_reward_func/std": 0.19951732456684113, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.53125, "frac_reward_zero_std": 0.0, "grad_norm": 8.625, "kl": 0.0009673306194599718, "learning_rate": 2.287037037037037e-07, "loss": 0.0, "num_tokens": 8925991.0, "reward": 0.07634272426366806, "reward_std": 0.06661258637905121, "rewards/grpo_reward_func/mean": 0.07634272426366806, "rewards/grpo_reward_func/std": 0.07981257140636444, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.546875, "frac_reward_zero_std": 0.0, "grad_norm": 13.125, "kl": 0.0010449464025441557, "learning_rate": 2.2777777777777776e-07, "loss": 0.0, "num_tokens": 8951031.0, "reward": 0.37363359332084656, "reward_std": 0.11995186656713486, "rewards/grpo_reward_func/mean": 0.37363359332084656, "rewards/grpo_reward_func/std": 0.16280175745487213, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.5625, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "kl": 0.0011851430463138968, "learning_rate": 2.2685185185185184e-07, "loss": 0.0, "num_tokens": 8976327.0, "reward": 0.29697471857070923, "reward_std": 0.16333544254302979, "rewards/grpo_reward_func/mean": 0.29697471857070923, "rewards/grpo_reward_func/std": 0.17323781549930573, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.578125, "frac_reward_zero_std": 0.0, "grad_norm": 11.25, "kl": 0.0011026623542420566, "learning_rate": 2.2592592592592591e-07, "loss": 0.0, "num_tokens": 9001479.0, "reward": 0.3700714707374573, "reward_std": 0.11212660372257233, "rewards/grpo_reward_func/mean": 0.3700714707374573, "rewards/grpo_reward_func/std": 0.17391134798526764, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.59375, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "kl": 0.0008961235289461911, "learning_rate": 2.25e-07, "loss": 0.0, "num_tokens": 9026831.0, "reward": 0.30335038900375366, "reward_std": 0.16283930838108063, "rewards/grpo_reward_func/mean": 0.30335038900375366, "rewards/grpo_reward_func/std": 0.21482953429222107, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.609375, "frac_reward_zero_std": 0.0, "grad_norm": 7.0625, "kl": 0.001187270536320284, "learning_rate": 2.2407407407407407e-07, "loss": 0.0, "num_tokens": 9052111.0, "reward": 0.2336607277393341, "reward_std": 0.07578499615192413, "rewards/grpo_reward_func/mean": 0.2336607277393341, "rewards/grpo_reward_func/std": 0.2178594022989273, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.625, "frac_reward_zero_std": 0.0, "grad_norm": 12.5625, "kl": 0.0016355722327716649, "learning_rate": 2.2314814814814814e-07, "loss": 0.0001, "num_tokens": 9077327.0, "reward": 0.24882102012634277, "reward_std": 0.10142374038696289, "rewards/grpo_reward_func/mean": 0.24882102012634277, "rewards/grpo_reward_func/std": 0.14036507904529572, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.640625, "frac_reward_zero_std": 0.0, "grad_norm": 14.5625, "kl": 0.0011905189749086276, "learning_rate": 2.222222222222222e-07, "loss": 0.0, "num_tokens": 9102855.0, "reward": 0.19088464975357056, "reward_std": 0.13671398162841797, "rewards/grpo_reward_func/mean": 0.19088464975357056, "rewards/grpo_reward_func/std": 0.19571226835250854, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.65625, "frac_reward_zero_std": 0.0, "grad_norm": 14.875, "kl": 0.0014347138931043446, "learning_rate": 2.212962962962963e-07, "loss": 0.0001, "num_tokens": 9127575.0, "reward": 0.4702339172363281, "reward_std": 0.13507477939128876, "rewards/grpo_reward_func/mean": 0.4702339172363281, "rewards/grpo_reward_func/std": 0.13804838061332703, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.671875, "frac_reward_zero_std": 0.0, "grad_norm": 7.25, "kl": 0.0011942110140807927, "learning_rate": 2.2037037037037037e-07, "loss": 0.0, "num_tokens": 9153375.0, "reward": 0.08592602610588074, "reward_std": 0.05403965711593628, "rewards/grpo_reward_func/mean": 0.08592602610588074, "rewards/grpo_reward_func/std": 0.05771300941705704, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.6875, "frac_reward_zero_std": 0.0, "grad_norm": 10.5625, "kl": 0.0017215957632288337, "learning_rate": 2.1944444444444442e-07, "loss": 0.0001, "num_tokens": 9178727.0, "reward": 0.3241480886936188, "reward_std": 0.09995287656784058, "rewards/grpo_reward_func/mean": 0.3241480886936188, "rewards/grpo_reward_func/std": 0.270202100276947, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.703125, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.0006930825184099376, "learning_rate": 2.1851851851851852e-07, "loss": 0.0, "num_tokens": 9203919.0, "reward": 0.33298927545547485, "reward_std": 0.13145846128463745, "rewards/grpo_reward_func/mean": 0.33298927545547485, "rewards/grpo_reward_func/std": 0.1428404450416565, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.71875, "frac_reward_zero_std": 0.0, "grad_norm": 6.71875, "kl": 0.0006192661821842194, "learning_rate": 2.1759259259259257e-07, "loss": 0.0, "num_tokens": 9229119.0, "reward": 0.24505698680877686, "reward_std": 0.09642099589109421, "rewards/grpo_reward_func/mean": 0.24505698680877686, "rewards/grpo_reward_func/std": 0.098020538687706, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.734375, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "kl": 0.0006466656923294067, "learning_rate": 2.1666666666666667e-07, "loss": 0.0, "num_tokens": 9253975.0, "reward": 0.38370344042778015, "reward_std": 0.1567784547805786, "rewards/grpo_reward_func/mean": 0.38370344042778015, "rewards/grpo_reward_func/std": 0.15556758642196655, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.75, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "kl": 0.0005825981497764587, "learning_rate": 2.1574074074074072e-07, "loss": 0.0, "num_tokens": 9279695.0, "reward": 0.11095234006643295, "reward_std": 0.08616747707128525, "rewards/grpo_reward_func/mean": 0.11095234006643295, "rewards/grpo_reward_func/std": 0.1468985229730606, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.765625, "frac_reward_zero_std": 0.0, "grad_norm": 14.375, "kl": 0.0006324804126052186, "learning_rate": 2.148148148148148e-07, "loss": 0.0, "num_tokens": 9304551.0, "reward": 0.3810734152793884, "reward_std": 0.14643797278404236, "rewards/grpo_reward_func/mean": 0.3810734152793884, "rewards/grpo_reward_func/std": 0.19883529841899872, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.78125, "frac_reward_zero_std": 0.0, "grad_norm": 8.1875, "kl": 0.0006064654589863494, "learning_rate": 2.1388888888888888e-07, "loss": 0.0, "num_tokens": 9330063.0, "reward": 0.20634137094020844, "reward_std": 0.1969866156578064, "rewards/grpo_reward_func/mean": 0.20634137094020844, "rewards/grpo_reward_func/std": 0.275967001914978, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.796875, "frac_reward_zero_std": 0.0, "grad_norm": 11.5625, "kl": 0.0008783147786743939, "learning_rate": 2.1296296296296295e-07, "loss": 0.0, "num_tokens": 9355063.0, "reward": 0.3551791310310364, "reward_std": 0.0490216389298439, "rewards/grpo_reward_func/mean": 0.3551791310310364, "rewards/grpo_reward_func/std": 0.05635032430291176, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.8125, "frac_reward_zero_std": 0.0, "grad_norm": 15.8125, "kl": 0.0009447563061257824, "learning_rate": 2.12037037037037e-07, "loss": 0.0, "num_tokens": 9380719.0, "reward": 0.13325411081314087, "reward_std": 0.11700575053691864, "rewards/grpo_reward_func/mean": 0.13325411081314087, "rewards/grpo_reward_func/std": 0.12200622260570526, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.828125, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.0015358021191786975, "learning_rate": 2.111111111111111e-07, "loss": 0.0001, "num_tokens": 9406015.0, "reward": 0.1894848346710205, "reward_std": 0.08906535059213638, "rewards/grpo_reward_func/mean": 0.1894848346710205, "rewards/grpo_reward_func/std": 0.11156714707612991, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.84375, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "kl": 0.001031855761539191, "learning_rate": 2.1018518518518518e-07, "loss": 0.0, "num_tokens": 9431111.0, "reward": 0.36022406816482544, "reward_std": 0.0762445256114006, "rewards/grpo_reward_func/mean": 0.36022406816482544, "rewards/grpo_reward_func/std": 0.08881448209285736, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.859375, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "kl": 0.0006837841647211462, "learning_rate": 2.0925925925925926e-07, "loss": 0.0, "num_tokens": 9455967.0, "reward": 0.5004688501358032, "reward_std": 0.1380087286233902, "rewards/grpo_reward_func/mean": 0.5004688501358032, "rewards/grpo_reward_func/std": 0.17277836799621582, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.875, "frac_reward_zero_std": 0.0, "grad_norm": 12.5625, "kl": 0.00043654504406731576, "learning_rate": 2.0833333333333333e-07, "loss": 0.0, "num_tokens": 9480727.0, "reward": 0.3416731357574463, "reward_std": 0.08171491324901581, "rewards/grpo_reward_func/mean": 0.3416731357574463, "rewards/grpo_reward_func/std": 0.08727457374334335, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.890625, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "kl": 0.0005711590347345918, "learning_rate": 2.0740740740740738e-07, "loss": 0.0, "num_tokens": 9506119.0, "reward": 0.2576856315135956, "reward_std": 0.07897262275218964, "rewards/grpo_reward_func/mean": 0.2576856315135956, "rewards/grpo_reward_func/std": 0.1244540736079216, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.90625, "frac_reward_zero_std": 0.0, "grad_norm": 11.875, "kl": 0.0010556740162428468, "learning_rate": 2.0648148148148148e-07, "loss": 0.0, "num_tokens": 9531303.0, "reward": 0.34753307700157166, "reward_std": 0.18784627318382263, "rewards/grpo_reward_func/mean": 0.34753307700157166, "rewards/grpo_reward_func/std": 0.25699713826179504, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.921875, "frac_reward_zero_std": 0.0, "grad_norm": 10.125, "kl": 0.001215081021655351, "learning_rate": 2.0555555555555553e-07, "loss": 0.0, "num_tokens": 9556359.0, "reward": 0.3578820824623108, "reward_std": 0.16381356120109558, "rewards/grpo_reward_func/mean": 0.3578820824623108, "rewards/grpo_reward_func/std": 0.1917094886302948, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.9375, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "kl": 0.000709039144567214, "learning_rate": 2.0462962962962964e-07, "loss": 0.0, "num_tokens": 9581671.0, "reward": 0.23046386241912842, "reward_std": 0.15167269110679626, "rewards/grpo_reward_func/mean": 0.23046386241912842, "rewards/grpo_reward_func/std": 0.17622780799865723, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.953125, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "kl": 0.0006106219661887735, "learning_rate": 2.0370370370370369e-07, "loss": 0.0, "num_tokens": 9606903.0, "reward": 0.2709818184375763, "reward_std": 0.12175773829221725, "rewards/grpo_reward_func/mean": 0.2709818184375763, "rewards/grpo_reward_func/std": 0.2393583059310913, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.96875, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.0006747972074663267, "learning_rate": 2.0277777777777776e-07, "loss": 0.0, "num_tokens": 9632351.0, "reward": 0.1715637445449829, "reward_std": 0.07509627938270569, "rewards/grpo_reward_func/mean": 0.1715637445449829, "rewards/grpo_reward_func/std": 0.12089093774557114, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.984375, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "kl": 0.0003686649724841118, "learning_rate": 2.0185185185185187e-07, "loss": 0.0, "num_tokens": 9657423.0, "reward": 0.25206711888313293, "reward_std": 0.11033091694116592, "rewards/grpo_reward_func/mean": 0.25206711888313293, "rewards/grpo_reward_func/std": 0.1744239181280136, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.0, "frac_reward_zero_std": 0.0, "grad_norm": 11.3125, "kl": 0.0015397807583212852, "learning_rate": 2.0092592592592591e-07, "loss": 0.0001, "num_tokens": 9682895.0, "reward": 0.22812840342521667, "reward_std": 0.16441597044467926, "rewards/grpo_reward_func/mean": 0.22812840342521667, "rewards/grpo_reward_func/std": 0.2453615814447403, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.015625, "frac_reward_zero_std": 0.0, "grad_norm": 14.625, "kl": 0.0013099961797706783, "learning_rate": 2e-07, "loss": 0.0001, "num_tokens": 9707727.0, "reward": 0.4902651309967041, "reward_std": 0.12934622168540955, "rewards/grpo_reward_func/mean": 0.4902651309967041, "rewards/grpo_reward_func/std": 0.12886442244052887, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.03125, "frac_reward_zero_std": 0.0, "grad_norm": 7.53125, "kl": 0.0006103202176745981, "learning_rate": 1.9907407407407407e-07, "loss": 0.0, "num_tokens": 9732983.0, "reward": 0.20654284954071045, "reward_std": 0.07593633234500885, "rewards/grpo_reward_func/mean": 0.20654284954071045, "rewards/grpo_reward_func/std": 0.11230036616325378, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.046875, "frac_reward_zero_std": 0.0, "grad_norm": 7.59375, "kl": 0.0013176609645597637, "learning_rate": 1.9814814814814814e-07, "loss": 0.0001, "num_tokens": 9758559.0, "reward": 0.2913218140602112, "reward_std": 0.07996957749128342, "rewards/grpo_reward_func/mean": 0.2913218140602112, "rewards/grpo_reward_func/std": 0.20861035585403442, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.0625, "frac_reward_zero_std": 0.0, "grad_norm": 11.9375, "kl": 0.00296200011507608, "learning_rate": 1.9722222222222222e-07, "loss": 0.0001, "num_tokens": 9784007.0, "reward": 0.3508840501308441, "reward_std": 0.15029387176036835, "rewards/grpo_reward_func/mean": 0.3508840501308441, "rewards/grpo_reward_func/std": 0.1881677657365799, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.078125, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "kl": 0.0007045343518257141, "learning_rate": 1.962962962962963e-07, "loss": 0.0, "num_tokens": 9809183.0, "reward": 0.38261544704437256, "reward_std": 0.13104796409606934, "rewards/grpo_reward_func/mean": 0.38261544704437256, "rewards/grpo_reward_func/std": 0.16021603345870972, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.09375, "frac_reward_zero_std": 0.0, "grad_norm": 8.5625, "kl": 0.0007661972194910049, "learning_rate": 1.9537037037037034e-07, "loss": 0.0, "num_tokens": 9834375.0, "reward": 0.2729978561401367, "reward_std": 0.14411945641040802, "rewards/grpo_reward_func/mean": 0.2729978561401367, "rewards/grpo_reward_func/std": 0.18588939309120178, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.109375, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "kl": 0.000638260506093502, "learning_rate": 1.9444444444444445e-07, "loss": 0.0, "num_tokens": 9860271.0, "reward": 0.14101102948188782, "reward_std": 0.07042165100574493, "rewards/grpo_reward_func/mean": 0.14101102948188782, "rewards/grpo_reward_func/std": 0.08415806293487549, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.125, "frac_reward_zero_std": 0.0, "grad_norm": 12.0625, "kl": 0.0011364755337126553, "learning_rate": 1.935185185185185e-07, "loss": 0.0, "num_tokens": 9884943.0, "reward": 0.4777190387248993, "reward_std": 0.09997393190860748, "rewards/grpo_reward_func/mean": 0.4777190387248993, "rewards/grpo_reward_func/std": 0.10503428429365158, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.140625, "frac_reward_zero_std": 0.0, "grad_norm": 10.875, "kl": 0.0010265009186696261, "learning_rate": 1.9259259259259257e-07, "loss": 0.0, "num_tokens": 9910519.0, "reward": 0.09008777141571045, "reward_std": 0.1275341957807541, "rewards/grpo_reward_func/mean": 0.09008777141571045, "rewards/grpo_reward_func/std": 0.14905866980552673, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.15625, "frac_reward_zero_std": 0.0, "grad_norm": 9.5625, "kl": 0.0010667061724234372, "learning_rate": 1.9166666666666668e-07, "loss": 0.0, "num_tokens": 9935623.0, "reward": 0.3963939845561981, "reward_std": 0.1269564926624298, "rewards/grpo_reward_func/mean": 0.3963939845561981, "rewards/grpo_reward_func/std": 0.12265895307064056, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.171875, "frac_reward_zero_std": 0.0, "grad_norm": 9.8125, "kl": 0.0010338767024222761, "learning_rate": 1.9074074074074073e-07, "loss": 0.0, "num_tokens": 9960807.0, "reward": 0.339819073677063, "reward_std": 0.131906658411026, "rewards/grpo_reward_func/mean": 0.339819073677063, "rewards/grpo_reward_func/std": 0.19305965304374695, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.1875, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "kl": 0.001020548545056954, "learning_rate": 1.8981481481481483e-07, "loss": 0.0, "num_tokens": 9986367.0, "reward": 0.15465489029884338, "reward_std": 0.1416236013174057, "rewards/grpo_reward_func/mean": 0.15465489029884338, "rewards/grpo_reward_func/std": 0.1588820368051529, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.203125, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.0015650332206860185, "learning_rate": 1.8888888888888888e-07, "loss": 0.0001, "num_tokens": 10011895.0, "reward": 0.16186270117759705, "reward_std": 0.07497712969779968, "rewards/grpo_reward_func/mean": 0.16186270117759705, "rewards/grpo_reward_func/std": 0.16643059253692627, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.21875, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.0011266165529377759, "learning_rate": 1.8796296296296295e-07, "loss": 0.0, "num_tokens": 10036927.0, "reward": 0.33959537744522095, "reward_std": 0.14239588379859924, "rewards/grpo_reward_func/mean": 0.33959537744522095, "rewards/grpo_reward_func/std": 0.18363837897777557, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.234375, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.00058710016310215, "learning_rate": 1.8703703703703703e-07, "loss": 0.0, "num_tokens": 10062167.0, "reward": 0.18064472079277039, "reward_std": 0.10977937281131744, "rewards/grpo_reward_func/mean": 0.18064472079277039, "rewards/grpo_reward_func/std": 0.12083122134208679, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.25, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.0005866416468052194, "learning_rate": 1.861111111111111e-07, "loss": 0.0, "num_tokens": 10088111.0, "reward": 0.1212317943572998, "reward_std": 0.11557292938232422, "rewards/grpo_reward_func/mean": 0.1212317943572998, "rewards/grpo_reward_func/std": 0.13769572973251343, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.265625, "frac_reward_zero_std": 0.0, "grad_norm": 11.1875, "kl": 0.0011352576548233628, "learning_rate": 1.8518518518518516e-07, "loss": 0.0, "num_tokens": 10113127.0, "reward": 0.3895754814147949, "reward_std": 0.13341936469078064, "rewards/grpo_reward_func/mean": 0.3895754814147949, "rewards/grpo_reward_func/std": 0.1501649022102356, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.28125, "frac_reward_zero_std": 0.0, "grad_norm": 7.28125, "kl": 0.0009206359682139009, "learning_rate": 1.8425925925925926e-07, "loss": 0.0, "num_tokens": 10138511.0, "reward": 0.29999056458473206, "reward_std": 0.08621784299612045, "rewards/grpo_reward_func/mean": 0.29999056458473206, "rewards/grpo_reward_func/std": 0.24252568185329437, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.296875, "frac_reward_zero_std": 0.0, "grad_norm": 12.0625, "kl": 0.0011214362166356295, "learning_rate": 1.833333333333333e-07, "loss": 0.0, "num_tokens": 10164063.0, "reward": 0.25143373012542725, "reward_std": 0.09383880347013474, "rewards/grpo_reward_func/mean": 0.25143373012542725, "rewards/grpo_reward_func/std": 0.13475444912910461, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.3125, "frac_reward_zero_std": 0.0, "grad_norm": 10.625, "kl": 0.0012225185928400606, "learning_rate": 1.824074074074074e-07, "loss": 0.0, "num_tokens": 10188807.0, "reward": 0.3598247468471527, "reward_std": 0.1472463756799698, "rewards/grpo_reward_func/mean": 0.3598247468471527, "rewards/grpo_reward_func/std": 0.15841133892536163, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.328125, "frac_reward_zero_std": 0.0, "grad_norm": 12.4375, "kl": 0.0006255771440919489, "learning_rate": 1.8148148148148149e-07, "loss": 0.0, "num_tokens": 10214127.0, "reward": 0.1664871722459793, "reward_std": 0.12487848103046417, "rewards/grpo_reward_func/mean": 0.1664871722459793, "rewards/grpo_reward_func/std": 0.22237654030323029, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.34375, "frac_reward_zero_std": 0.0, "grad_norm": 12.0625, "kl": 0.0012482001329772174, "learning_rate": 1.8055555555555554e-07, "loss": 0.0001, "num_tokens": 10238927.0, "reward": 0.48037129640579224, "reward_std": 0.12004883587360382, "rewards/grpo_reward_func/mean": 0.48037129640579224, "rewards/grpo_reward_func/std": 0.17319442331790924, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.359375, "frac_reward_zero_std": 0.0, "grad_norm": 7.5, "kl": 0.0008822223462630063, "learning_rate": 1.7962962962962964e-07, "loss": 0.0, "num_tokens": 10264383.0, "reward": 0.24905702471733093, "reward_std": 0.09374159574508667, "rewards/grpo_reward_func/mean": 0.24905702471733093, "rewards/grpo_reward_func/std": 0.20010940730571747, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.375, "frac_reward_zero_std": 0.0, "grad_norm": 14.0, "kl": 0.001847555220592767, "learning_rate": 1.787037037037037e-07, "loss": 0.0001, "num_tokens": 10289287.0, "reward": 0.3152793049812317, "reward_std": 0.08679287880659103, "rewards/grpo_reward_func/mean": 0.3152793049812317, "rewards/grpo_reward_func/std": 0.09012952446937561, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.390625, "frac_reward_zero_std": 0.0, "grad_norm": 12.875, "kl": 0.0023550866171717644, "learning_rate": 1.7777777777777776e-07, "loss": 0.0001, "num_tokens": 10314655.0, "reward": 0.2986000180244446, "reward_std": 0.16402404010295868, "rewards/grpo_reward_func/mean": 0.2986000180244446, "rewards/grpo_reward_func/std": 0.18015649914741516, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.40625, "frac_reward_zero_std": 0.0, "grad_norm": 12.5, "kl": 0.0007087271515047178, "learning_rate": 1.7685185185185184e-07, "loss": 0.0, "num_tokens": 10339839.0, "reward": 0.22273430228233337, "reward_std": 0.11841462552547455, "rewards/grpo_reward_func/mean": 0.22273430228233337, "rewards/grpo_reward_func/std": 0.13113521039485931, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.421875, "frac_reward_zero_std": 0.0, "grad_norm": 13.25, "kl": 0.0006067858485039324, "learning_rate": 1.7592592592592592e-07, "loss": 0.0, "num_tokens": 10364735.0, "reward": 0.28128859400749207, "reward_std": 0.1168629378080368, "rewards/grpo_reward_func/mean": 0.28128859400749207, "rewards/grpo_reward_func/std": 0.23241311311721802, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.4375, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.0004597442748490721, "learning_rate": 1.75e-07, "loss": 0.0, "num_tokens": 10389631.0, "reward": 0.3766486644744873, "reward_std": 0.08330925554037094, "rewards/grpo_reward_func/mean": 0.3766486644744873, "rewards/grpo_reward_func/std": 0.09394894540309906, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.453125, "frac_reward_zero_std": 0.0, "grad_norm": 7.59375, "kl": 0.0014080870896577835, "learning_rate": 1.7407407407407407e-07, "loss": 0.0001, "num_tokens": 10415031.0, "reward": 0.14589783549308777, "reward_std": 0.1070764809846878, "rewards/grpo_reward_func/mean": 0.14589783549308777, "rewards/grpo_reward_func/std": 0.18981052935123444, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.46875, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "kl": 0.001625294506084174, "learning_rate": 1.7314814814814812e-07, "loss": 0.0001, "num_tokens": 10440183.0, "reward": 0.30828016996383667, "reward_std": 0.13814100623130798, "rewards/grpo_reward_func/mean": 0.30828016996383667, "rewards/grpo_reward_func/std": 0.1791732907295227, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.484375, "frac_reward_zero_std": 0.0, "grad_norm": 10.875, "kl": 0.0011470156605355442, "learning_rate": 1.7222222222222222e-07, "loss": 0.0, "num_tokens": 10465111.0, "reward": 0.555537760257721, "reward_std": 0.13992036879062653, "rewards/grpo_reward_func/mean": 0.555537760257721, "rewards/grpo_reward_func/std": 0.1409028172492981, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.5, "frac_reward_zero_std": 0.0, "grad_norm": 12.0, "kl": 0.0006981039041420445, "learning_rate": 1.712962962962963e-07, "loss": 0.0, "num_tokens": 10490231.0, "reward": 0.2912987172603607, "reward_std": 0.06564676761627197, "rewards/grpo_reward_func/mean": 0.2912987172603607, "rewards/grpo_reward_func/std": 0.09684650599956512, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.515625, "frac_reward_zero_std": 0.0, "grad_norm": 10.5, "kl": 0.001724751084111631, "learning_rate": 1.7037037037037035e-07, "loss": 0.0001, "num_tokens": 10515439.0, "reward": 0.2970144748687744, "reward_std": 0.09115941822528839, "rewards/grpo_reward_func/mean": 0.2970144748687744, "rewards/grpo_reward_func/std": 0.1823878437280655, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.53125, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.0006452612578868866, "learning_rate": 1.6944444444444445e-07, "loss": 0.0, "num_tokens": 10540311.0, "reward": 0.2946871221065521, "reward_std": 0.07320266216993332, "rewards/grpo_reward_func/mean": 0.2946871221065521, "rewards/grpo_reward_func/std": 0.10113289952278137, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.546875, "frac_reward_zero_std": 0.0, "grad_norm": 11.3125, "kl": 0.0011799820058513433, "learning_rate": 1.685185185185185e-07, "loss": 0.0, "num_tokens": 10565607.0, "reward": 0.36064761877059937, "reward_std": 0.12253247946500778, "rewards/grpo_reward_func/mean": 0.36064761877059937, "rewards/grpo_reward_func/std": 0.1580498218536377, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.5625, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.0006311101315077394, "learning_rate": 1.675925925925926e-07, "loss": 0.0, "num_tokens": 10590943.0, "reward": 0.255226194858551, "reward_std": 0.1812346875667572, "rewards/grpo_reward_func/mean": 0.255226194858551, "rewards/grpo_reward_func/std": 0.21465614438056946, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.578125, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "kl": 0.0003934912383556366, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "num_tokens": 10616319.0, "reward": 0.23169803619384766, "reward_std": 0.11887459456920624, "rewards/grpo_reward_func/mean": 0.23169803619384766, "rewards/grpo_reward_func/std": 0.15518471598625183, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.59375, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.0007099388167262077, "learning_rate": 1.6574074074074073e-07, "loss": 0.0, "num_tokens": 10642207.0, "reward": 0.061446841806173325, "reward_std": 0.14731059968471527, "rewards/grpo_reward_func/mean": 0.061446841806173325, "rewards/grpo_reward_func/std": 0.14346599578857422, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.609375, "frac_reward_zero_std": 0.0, "grad_norm": 12.5, "kl": 0.0018291054293513298, "learning_rate": 1.648148148148148e-07, "loss": 0.0001, "num_tokens": 10667199.0, "reward": 0.2039181888103485, "reward_std": 0.17049936950206757, "rewards/grpo_reward_func/mean": 0.2039181888103485, "rewards/grpo_reward_func/std": 0.20998157560825348, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.625, "frac_reward_zero_std": 0.0, "grad_norm": 12.375, "kl": 0.0011474412167444825, "learning_rate": 1.6388888888888888e-07, "loss": 0.0, "num_tokens": 10692487.0, "reward": 0.26651886105537415, "reward_std": 0.10566580295562744, "rewards/grpo_reward_func/mean": 0.26651886105537415, "rewards/grpo_reward_func/std": 0.23722681403160095, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.640625, "frac_reward_zero_std": 0.0, "grad_norm": 13.5, "kl": 0.0013400282186921686, "learning_rate": 1.6296296296296298e-07, "loss": 0.0001, "num_tokens": 10717431.0, "reward": 0.4234394431114197, "reward_std": 0.11313052475452423, "rewards/grpo_reward_func/mean": 0.4234394431114197, "rewards/grpo_reward_func/std": 0.15678033232688904, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.65625, "frac_reward_zero_std": 0.0, "grad_norm": 10.4375, "kl": 0.0011549523624125868, "learning_rate": 1.6203703703703703e-07, "loss": 0.0, "num_tokens": 10743015.0, "reward": 0.20128034055233002, "reward_std": 0.09024003148078918, "rewards/grpo_reward_func/mean": 0.20128034055233002, "rewards/grpo_reward_func/std": 0.19719721376895905, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.671875, "frac_reward_zero_std": 0.0, "grad_norm": 5.90625, "kl": 0.0007055181486066431, "learning_rate": 1.611111111111111e-07, "loss": 0.0, "num_tokens": 10768159.0, "reward": 0.26648253202438354, "reward_std": 0.04378199204802513, "rewards/grpo_reward_func/mean": 0.26648253202438354, "rewards/grpo_reward_func/std": 0.05463617295026779, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.6875, "frac_reward_zero_std": 0.0, "grad_norm": 10.0, "kl": 0.0019033817807212472, "learning_rate": 1.6018518518518518e-07, "loss": 0.0001, "num_tokens": 10793215.0, "reward": 0.3865242600440979, "reward_std": 0.12939924001693726, "rewards/grpo_reward_func/mean": 0.3865242600440979, "rewards/grpo_reward_func/std": 0.13254022598266602, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.703125, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.0012472507660277188, "learning_rate": 1.5925925925925926e-07, "loss": 0.0, "num_tokens": 10818415.0, "reward": 0.21385148167610168, "reward_std": 0.1188204437494278, "rewards/grpo_reward_func/mean": 0.21385148167610168, "rewards/grpo_reward_func/std": 0.19592618942260742, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.71875, "frac_reward_zero_std": 0.0, "grad_norm": 11.9375, "kl": 0.0010245550947729498, "learning_rate": 1.583333333333333e-07, "loss": 0.0, "num_tokens": 10843663.0, "reward": 0.2595471739768982, "reward_std": 0.127162903547287, "rewards/grpo_reward_func/mean": 0.2595471739768982, "rewards/grpo_reward_func/std": 0.1796990931034088, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.734375, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "kl": 0.001687789976131171, "learning_rate": 1.574074074074074e-07, "loss": 0.0001, "num_tokens": 10869183.0, "reward": 0.1994100958108902, "reward_std": 0.09664750844240189, "rewards/grpo_reward_func/mean": 0.1994100958108902, "rewards/grpo_reward_func/std": 0.19897720217704773, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.75, "frac_reward_zero_std": 0.0, "grad_norm": 12.4375, "kl": 0.0014949360047467053, "learning_rate": 1.5648148148148146e-07, "loss": 0.0001, "num_tokens": 10894479.0, "reward": 0.24675852060317993, "reward_std": 0.10688143223524094, "rewards/grpo_reward_func/mean": 0.24675852060317993, "rewards/grpo_reward_func/std": 0.12910714745521545, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.765625, "frac_reward_zero_std": 0.0, "grad_norm": 13.125, "kl": 0.0012780396500602365, "learning_rate": 1.5555555555555556e-07, "loss": 0.0001, "num_tokens": 10919831.0, "reward": 0.2442871779203415, "reward_std": 0.10166030377149582, "rewards/grpo_reward_func/mean": 0.2442871779203415, "rewards/grpo_reward_func/std": 0.18765191733837128, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.78125, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.0011734996805898845, "learning_rate": 1.546296296296296e-07, "loss": 0.0, "num_tokens": 10945471.0, "reward": 0.21862854063510895, "reward_std": 0.09884025901556015, "rewards/grpo_reward_func/mean": 0.21862854063510895, "rewards/grpo_reward_func/std": 0.2449023574590683, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.796875, "frac_reward_zero_std": 0.0, "grad_norm": 8.0, "kl": 0.0006825063901487738, "learning_rate": 1.537037037037037e-07, "loss": 0.0, "num_tokens": 10970375.0, "reward": 0.3288189470767975, "reward_std": 0.1153157502412796, "rewards/grpo_reward_func/mean": 0.3288189470767975, "rewards/grpo_reward_func/std": 0.12331512570381165, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.8125, "frac_reward_zero_std": 0.0, "grad_norm": 9.9375, "kl": 0.0016119840438477695, "learning_rate": 1.527777777777778e-07, "loss": 0.0001, "num_tokens": 10995247.0, "reward": 0.43004822731018066, "reward_std": 0.14090153574943542, "rewards/grpo_reward_func/mean": 0.43004822731018066, "rewards/grpo_reward_func/std": 0.15082739293575287, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.828125, "frac_reward_zero_std": 0.0, "grad_norm": 13.75, "kl": 0.001201577513711527, "learning_rate": 1.5185185185185184e-07, "loss": 0.0, "num_tokens": 11020023.0, "reward": 0.4190066158771515, "reward_std": 0.11930587887763977, "rewards/grpo_reward_func/mean": 0.4190066158771515, "rewards/grpo_reward_func/std": 0.13784445822238922, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.84375, "frac_reward_zero_std": 0.0, "grad_norm": 12.5625, "kl": 0.00105038468609564, "learning_rate": 1.5092592592592592e-07, "loss": 0.0, "num_tokens": 11045703.0, "reward": 0.15409120917320251, "reward_std": 0.0602850615978241, "rewards/grpo_reward_func/mean": 0.15409120917320251, "rewards/grpo_reward_func/std": 0.09643904119729996, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.859375, "frac_reward_zero_std": 0.0, "grad_norm": 14.5625, "kl": 0.0010647746094036847, "learning_rate": 1.5e-07, "loss": 0.0, "num_tokens": 11070775.0, "reward": 0.23255158960819244, "reward_std": 0.121828094124794, "rewards/grpo_reward_func/mean": 0.23255158960819244, "rewards/grpo_reward_func/std": 0.18913201987743378, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.875, "frac_reward_zero_std": 0.0, "grad_norm": 8.625, "kl": 0.0009447056509088725, "learning_rate": 1.4907407407407407e-07, "loss": 0.0, "num_tokens": 11096191.0, "reward": 0.2456386387348175, "reward_std": 0.08084774017333984, "rewards/grpo_reward_func/mean": 0.2456386387348175, "rewards/grpo_reward_func/std": 0.189274862408638, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.890625, "frac_reward_zero_std": 0.0, "grad_norm": 13.875, "kl": 0.0006106716318754479, "learning_rate": 1.4814814814814815e-07, "loss": 0.0, "num_tokens": 11121111.0, "reward": 0.26635077595710754, "reward_std": 0.11237984895706177, "rewards/grpo_reward_func/mean": 0.26635077595710754, "rewards/grpo_reward_func/std": 0.14029628038406372, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.90625, "frac_reward_zero_std": 0.0, "grad_norm": 7.9375, "kl": 0.000595945239183493, "learning_rate": 1.4722222222222222e-07, "loss": 0.0, "num_tokens": 11146031.0, "reward": 0.3882697820663452, "reward_std": 0.11520197987556458, "rewards/grpo_reward_func/mean": 0.3882697820663452, "rewards/grpo_reward_func/std": 0.15360315144062042, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.921875, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "kl": 0.0020495177595876157, "learning_rate": 1.4629629629629627e-07, "loss": 0.0001, "num_tokens": 11170975.0, "reward": 0.3387864828109741, "reward_std": 0.07305868715047836, "rewards/grpo_reward_func/mean": 0.3387864828109741, "rewards/grpo_reward_func/std": 0.1130770593881607, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.9375, "frac_reward_zero_std": 0.0, "grad_norm": 14.9375, "kl": 0.0012185790692456067, "learning_rate": 1.4537037037037037e-07, "loss": 0.0, "num_tokens": 11195919.0, "reward": 0.3070831000804901, "reward_std": 0.08975166827440262, "rewards/grpo_reward_func/mean": 0.3070831000804901, "rewards/grpo_reward_func/std": 0.0882553681731224, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.953125, "frac_reward_zero_std": 0.0, "grad_norm": 18.0, "kl": 0.002497857822163496, "learning_rate": 1.4444444444444442e-07, "loss": 0.0001, "num_tokens": 11221399.0, "reward": 0.3126865029335022, "reward_std": 0.13266904652118683, "rewards/grpo_reward_func/mean": 0.3126865029335022, "rewards/grpo_reward_func/std": 0.14260481297969818, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.96875, "frac_reward_zero_std": 0.0, "grad_norm": 18.5, "kl": 0.00145068543497473, "learning_rate": 1.435185185185185e-07, "loss": 0.0001, "num_tokens": 11246231.0, "reward": 0.36069953441619873, "reward_std": 0.10953323543071747, "rewards/grpo_reward_func/mean": 0.36069953441619873, "rewards/grpo_reward_func/std": 0.11150137335062027, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.984375, "frac_reward_zero_std": 0.0, "grad_norm": 10.5625, "kl": 0.0013835610006935894, "learning_rate": 1.425925925925926e-07, "loss": 0.0001, "num_tokens": 11271471.0, "reward": 0.3240332305431366, "reward_std": 0.09530281275510788, "rewards/grpo_reward_func/mean": 0.3240332305431366, "rewards/grpo_reward_func/std": 0.17097727954387665, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.0, "frac_reward_zero_std": 0.0, "grad_norm": 12.0, "kl": 0.0013482021167874336, "learning_rate": 1.4166666666666665e-07, "loss": 0.0001, "num_tokens": 11296711.0, "reward": 0.309320330619812, "reward_std": 0.0705028623342514, "rewards/grpo_reward_func/mean": 0.309320330619812, "rewards/grpo_reward_func/std": 0.2660787105560303, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.015625, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "kl": 0.0004683341830968857, "learning_rate": 1.4074074074074075e-07, "loss": 0.0, "num_tokens": 11322287.0, "reward": 0.19587868452072144, "reward_std": 0.08648732304573059, "rewards/grpo_reward_func/mean": 0.19587868452072144, "rewards/grpo_reward_func/std": 0.09603223204612732, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.03125, "frac_reward_zero_std": 0.0, "grad_norm": 8.3125, "kl": 0.0008416750060860068, "learning_rate": 1.398148148148148e-07, "loss": 0.0, "num_tokens": 11347087.0, "reward": 0.46222931146621704, "reward_std": 0.10860372334718704, "rewards/grpo_reward_func/mean": 0.46222931146621704, "rewards/grpo_reward_func/std": 0.15214873850345612, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.046875, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "kl": 0.0014006058045197278, "learning_rate": 1.3888888888888888e-07, "loss": 0.0001, "num_tokens": 11371863.0, "reward": 0.5111293792724609, "reward_std": 0.08110877871513367, "rewards/grpo_reward_func/mean": 0.5111293792724609, "rewards/grpo_reward_func/std": 0.09304346144199371, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.0625, "frac_reward_zero_std": 0.0, "grad_norm": 13.25, "kl": 0.0011988499463768676, "learning_rate": 1.3796296296296296e-07, "loss": 0.0, "num_tokens": 11396831.0, "reward": 0.4040873050689697, "reward_std": 0.09579525142908096, "rewards/grpo_reward_func/mean": 0.4040873050689697, "rewards/grpo_reward_func/std": 0.09277461469173431, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.078125, "frac_reward_zero_std": 0.0, "grad_norm": 7.6875, "kl": 0.0012973888660781085, "learning_rate": 1.3703703703703703e-07, "loss": 0.0001, "num_tokens": 11421887.0, "reward": 0.33569732308387756, "reward_std": 0.06503790616989136, "rewards/grpo_reward_func/mean": 0.33569732308387756, "rewards/grpo_reward_func/std": 0.07260998338460922, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.09375, "frac_reward_zero_std": 0.0, "grad_norm": 13.1875, "kl": 0.0012591605191119015, "learning_rate": 1.3611111111111108e-07, "loss": 0.0001, "num_tokens": 11446743.0, "reward": 0.46880820393562317, "reward_std": 0.12013055384159088, "rewards/grpo_reward_func/mean": 0.46880820393562317, "rewards/grpo_reward_func/std": 0.1322290599346161, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.109375, "frac_reward_zero_std": 0.0, "grad_norm": 10.125, "kl": 0.0012072750396328047, "learning_rate": 1.3518518518518518e-07, "loss": 0.0, "num_tokens": 11472823.0, "reward": 0.0363832488656044, "reward_std": 0.07413066923618317, "rewards/grpo_reward_func/mean": 0.0363832488656044, "rewards/grpo_reward_func/std": 0.10118604451417923, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.125, "frac_reward_zero_std": 0.0, "grad_norm": 10.125, "kl": 0.0018538168515078723, "learning_rate": 1.3425925925925926e-07, "loss": 0.0001, "num_tokens": 11498455.0, "reward": 0.1913381814956665, "reward_std": 0.0929805338382721, "rewards/grpo_reward_func/mean": 0.1913381814956665, "rewards/grpo_reward_func/std": 0.16615688800811768, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.140625, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "kl": 0.0006855083629488945, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "num_tokens": 11523255.0, "reward": 0.27506938576698303, "reward_std": 0.10065136849880219, "rewards/grpo_reward_func/mean": 0.27506938576698303, "rewards/grpo_reward_func/std": 0.19226348400115967, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.15625, "frac_reward_zero_std": 0.0, "grad_norm": 10.4375, "kl": 0.00046966194349806756, "learning_rate": 1.324074074074074e-07, "loss": 0.0, "num_tokens": 11548327.0, "reward": 0.25183096528053284, "reward_std": 0.10666584968566895, "rewards/grpo_reward_func/mean": 0.25183096528053284, "rewards/grpo_reward_func/std": 0.1858406513929367, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.171875, "frac_reward_zero_std": 0.0, "grad_norm": 12.4375, "kl": 0.0009555828873999417, "learning_rate": 1.3148148148148146e-07, "loss": 0.0, "num_tokens": 11573839.0, "reward": 0.2436429113149643, "reward_std": 0.1274951547384262, "rewards/grpo_reward_func/mean": 0.2436429113149643, "rewards/grpo_reward_func/std": 0.19650402665138245, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.1875, "frac_reward_zero_std": 0.0, "grad_norm": 8.4375, "kl": 0.001801790960598737, "learning_rate": 1.3055555555555556e-07, "loss": 0.0001, "num_tokens": 11598935.0, "reward": 0.3082733154296875, "reward_std": 0.2006826102733612, "rewards/grpo_reward_func/mean": 0.3082733154296875, "rewards/grpo_reward_func/std": 0.24268022179603577, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.203125, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "kl": 0.0008424867992289364, "learning_rate": 1.2962962962962961e-07, "loss": 0.0, "num_tokens": 11623903.0, "reward": 0.36446842551231384, "reward_std": 0.10075566172599792, "rewards/grpo_reward_func/mean": 0.36446842551231384, "rewards/grpo_reward_func/std": 0.14413763582706451, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.21875, "frac_reward_zero_std": 0.0, "grad_norm": 11.3125, "kl": 0.00168600061442703, "learning_rate": 1.287037037037037e-07, "loss": 0.0001, "num_tokens": 11648591.0, "reward": 0.34162139892578125, "reward_std": 0.07324203103780746, "rewards/grpo_reward_func/mean": 0.34162139892578125, "rewards/grpo_reward_func/std": 0.09398888051509857, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.234375, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "kl": 0.0011306122469250113, "learning_rate": 1.2777777777777777e-07, "loss": 0.0, "num_tokens": 11674071.0, "reward": 0.09499461203813553, "reward_std": 0.08520884811878204, "rewards/grpo_reward_func/mean": 0.09499461203813553, "rewards/grpo_reward_func/std": 0.10134407877922058, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.25, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.0006733744667144492, "learning_rate": 1.2685185185185184e-07, "loss": 0.0, "num_tokens": 11699335.0, "reward": 0.2652754783630371, "reward_std": 0.09839694201946259, "rewards/grpo_reward_func/mean": 0.2652754783630371, "rewards/grpo_reward_func/std": 0.10382307320833206, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.265625, "frac_reward_zero_std": 0.0, "grad_norm": 14.0625, "kl": 0.0005735903978347778, "learning_rate": 1.2592592592592592e-07, "loss": 0.0, "num_tokens": 11725127.0, "reward": 0.12507686018943787, "reward_std": 0.14018288254737854, "rewards/grpo_reward_func/mean": 0.12507686018943787, "rewards/grpo_reward_func/std": 0.15515510737895966, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.28125, "frac_reward_zero_std": 0.0, "grad_norm": 8.3125, "kl": 0.002856824896298349, "learning_rate": 1.25e-07, "loss": 0.0001, "num_tokens": 11750423.0, "reward": 0.2550414800643921, "reward_std": 0.10255648195743561, "rewards/grpo_reward_func/mean": 0.2550414800643921, "rewards/grpo_reward_func/std": 0.18651345372200012, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.296875, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.000869341180077754, "learning_rate": 1.2407407407407407e-07, "loss": 0.0, "num_tokens": 11776615.0, "reward": 0.18968135118484497, "reward_std": 0.12408946454524994, "rewards/grpo_reward_func/mean": 0.18968135118484497, "rewards/grpo_reward_func/std": 0.2001882642507553, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.3125, "frac_reward_zero_std": 0.0, "grad_norm": 6.6875, "kl": 0.0007764963957015425, "learning_rate": 1.2314814814814815e-07, "loss": 0.0, "num_tokens": 11802007.0, "reward": 0.28070777654647827, "reward_std": 0.07643483579158783, "rewards/grpo_reward_func/mean": 0.28070777654647827, "rewards/grpo_reward_func/std": 0.08400604128837585, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.328125, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.000670313835144043, "learning_rate": 1.2222222222222222e-07, "loss": 0.0, "num_tokens": 11827703.0, "reward": 0.09706881642341614, "reward_std": 0.062192559242248535, "rewards/grpo_reward_func/mean": 0.09706881642341614, "rewards/grpo_reward_func/std": 0.06070980429649353, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.34375, "frac_reward_zero_std": 0.0, "grad_norm": 10.625, "kl": 0.0012332831101957709, "learning_rate": 1.212962962962963e-07, "loss": 0.0, "num_tokens": 11852679.0, "reward": 0.29699277877807617, "reward_std": 0.10623517632484436, "rewards/grpo_reward_func/mean": 0.29699277877807617, "rewards/grpo_reward_func/std": 0.14658339321613312, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.359375, "frac_reward_zero_std": 0.0, "grad_norm": 6.71875, "kl": 0.0010181590914726257, "learning_rate": 1.2037037037037035e-07, "loss": 0.0, "num_tokens": 11878375.0, "reward": 0.21717457473278046, "reward_std": 0.0667777881026268, "rewards/grpo_reward_func/mean": 0.21717457473278046, "rewards/grpo_reward_func/std": 0.1625964194536209, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.375, "frac_reward_zero_std": 0.0, "grad_norm": 9.8125, "kl": 0.0016762353479862213, "learning_rate": 1.1944444444444445e-07, "loss": 0.0001, "num_tokens": 11903631.0, "reward": 0.22018520534038544, "reward_std": 0.10192622244358063, "rewards/grpo_reward_func/mean": 0.22018520534038544, "rewards/grpo_reward_func/std": 0.14623050391674042, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.390625, "frac_reward_zero_std": 0.0, "grad_norm": 12.5, "kl": 0.0004212940257275477, "learning_rate": 1.1851851851851851e-07, "loss": 0.0, "num_tokens": 11928839.0, "reward": 0.21051539480686188, "reward_std": 0.13067127764225006, "rewards/grpo_reward_func/mean": 0.21051539480686188, "rewards/grpo_reward_func/std": 0.21328914165496826, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.40625, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "kl": 0.0012048408098053187, "learning_rate": 1.1759259259259259e-07, "loss": 0.0, "num_tokens": 11954751.0, "reward": 0.05355652794241905, "reward_std": 0.10848183929920197, "rewards/grpo_reward_func/mean": 0.05355652794241905, "rewards/grpo_reward_func/std": 0.10706359893083572, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.421875, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.0014668001676909626, "learning_rate": 1.1666666666666667e-07, "loss": 0.0001, "num_tokens": 11979711.0, "reward": 0.38055509328842163, "reward_std": 0.097016841173172, "rewards/grpo_reward_func/mean": 0.38055509328842163, "rewards/grpo_reward_func/std": 0.17705973982810974, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.4375, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "kl": 0.001055899978382513, "learning_rate": 1.1574074074074074e-07, "loss": 0.0, "num_tokens": 12005343.0, "reward": 0.2753819525241852, "reward_std": 0.10407428443431854, "rewards/grpo_reward_func/mean": 0.2753819525241852, "rewards/grpo_reward_func/std": 0.24588319659233093, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.453125, "frac_reward_zero_std": 0.0, "grad_norm": 13.75, "kl": 0.00115215900586918, "learning_rate": 1.148148148148148e-07, "loss": 0.0, "num_tokens": 12030439.0, "reward": 0.31799519062042236, "reward_std": 0.12696103751659393, "rewards/grpo_reward_func/mean": 0.31799519062042236, "rewards/grpo_reward_func/std": 0.14141467213630676, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.46875, "frac_reward_zero_std": 0.0, "grad_norm": 11.0, "kl": 0.0012049302167724818, "learning_rate": 1.1388888888888888e-07, "loss": 0.0, "num_tokens": 12055199.0, "reward": 0.31926023960113525, "reward_std": 0.12855856120586395, "rewards/grpo_reward_func/mean": 0.31926023960113525, "rewards/grpo_reward_func/std": 0.13934962451457977, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.484375, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "kl": 0.0006446990446420386, "learning_rate": 1.1296296296296296e-07, "loss": 0.0, "num_tokens": 12080303.0, "reward": 0.2243320643901825, "reward_std": 0.10535985231399536, "rewards/grpo_reward_func/mean": 0.2243320643901825, "rewards/grpo_reward_func/std": 0.13603921234607697, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.5, "frac_reward_zero_std": 0.0, "grad_norm": 16.875, "kl": 0.0008130024070851505, "learning_rate": 1.1203703703703703e-07, "loss": 0.0, "num_tokens": 12105327.0, "reward": 0.3918301463127136, "reward_std": 0.12956207990646362, "rewards/grpo_reward_func/mean": 0.3918301463127136, "rewards/grpo_reward_func/std": 0.14622493088245392, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.515625, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "kl": 0.001358548819553107, "learning_rate": 1.111111111111111e-07, "loss": 0.0001, "num_tokens": 12130511.0, "reward": 0.2851361334323883, "reward_std": 0.1555725485086441, "rewards/grpo_reward_func/mean": 0.2851361334323883, "rewards/grpo_reward_func/std": 0.1634860336780548, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.53125, "frac_reward_zero_std": 0.0, "grad_norm": 9.4375, "kl": 0.0005101002752780914, "learning_rate": 1.1018518518518519e-07, "loss": 0.0, "num_tokens": 12156063.0, "reward": 0.12298320978879929, "reward_std": 0.05867529287934303, "rewards/grpo_reward_func/mean": 0.12298320978879929, "rewards/grpo_reward_func/std": 0.06098959594964981, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.546875, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "kl": 0.0007716265245107934, "learning_rate": 1.0925925925925926e-07, "loss": 0.0, "num_tokens": 12181463.0, "reward": 0.149795800447464, "reward_std": 0.10114938765764236, "rewards/grpo_reward_func/mean": 0.149795800447464, "rewards/grpo_reward_func/std": 0.11674728989601135, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.5625, "frac_reward_zero_std": 0.0, "grad_norm": 12.6875, "kl": 0.0016588344587944448, "learning_rate": 1.0833333333333334e-07, "loss": 0.0001, "num_tokens": 12206231.0, "reward": 0.35547971725463867, "reward_std": 0.12198421359062195, "rewards/grpo_reward_func/mean": 0.35547971725463867, "rewards/grpo_reward_func/std": 0.14273548126220703, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.578125, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "kl": 0.0024268822162412107, "learning_rate": 1.074074074074074e-07, "loss": 0.0001, "num_tokens": 12231703.0, "reward": 0.28086715936660767, "reward_std": 0.13494156301021576, "rewards/grpo_reward_func/mean": 0.28086715936660767, "rewards/grpo_reward_func/std": 0.22161196172237396, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.59375, "frac_reward_zero_std": 0.0, "grad_norm": 8.5625, "kl": 0.0006409324705600739, "learning_rate": 1.0648148148148148e-07, "loss": 0.0, "num_tokens": 12256543.0, "reward": 0.317976176738739, "reward_std": 0.06494971364736557, "rewards/grpo_reward_func/mean": 0.317976176738739, "rewards/grpo_reward_func/std": 0.0717727318406105, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.609375, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.0014852539461571723, "learning_rate": 1.0555555555555555e-07, "loss": 0.0001, "num_tokens": 12281863.0, "reward": 0.18861307203769684, "reward_std": 0.17945444583892822, "rewards/grpo_reward_func/mean": 0.18861307203769684, "rewards/grpo_reward_func/std": 0.1798945814371109, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.625, "frac_reward_zero_std": 0.0, "grad_norm": 9.8125, "kl": 0.0006892156670801342, "learning_rate": 1.0462962962962963e-07, "loss": 0.0, "num_tokens": 12306919.0, "reward": 0.30536431074142456, "reward_std": 0.15281975269317627, "rewards/grpo_reward_func/mean": 0.30536431074142456, "rewards/grpo_reward_func/std": 0.17713572084903717, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.640625, "frac_reward_zero_std": 0.0, "grad_norm": 14.6875, "kl": 0.000986199505859986, "learning_rate": 1.0370370370370369e-07, "loss": 0.0, "num_tokens": 12332351.0, "reward": 0.14482443034648895, "reward_std": 0.17538149654865265, "rewards/grpo_reward_func/mean": 0.14482443034648895, "rewards/grpo_reward_func/std": 0.20862267911434174, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.65625, "frac_reward_zero_std": 0.0, "grad_norm": 10.75, "kl": 0.0005291861889418215, "learning_rate": 1.0277777777777777e-07, "loss": 0.0, "num_tokens": 12357783.0, "reward": 0.24565353989601135, "reward_std": 0.07573194801807404, "rewards/grpo_reward_func/mean": 0.24565353989601135, "rewards/grpo_reward_func/std": 0.17366138100624084, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.671875, "frac_reward_zero_std": 0.0, "grad_norm": 10.625, "kl": 0.0012580165202962235, "learning_rate": 1.0185185185185184e-07, "loss": 0.0001, "num_tokens": 12382783.0, "reward": 0.28701528906822205, "reward_std": 0.13948951661586761, "rewards/grpo_reward_func/mean": 0.28701528906822205, "rewards/grpo_reward_func/std": 0.1597539633512497, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.6875, "frac_reward_zero_std": 0.0, "grad_norm": 9.4375, "kl": 0.0010586930438876152, "learning_rate": 1.0092592592592593e-07, "loss": 0.0, "num_tokens": 12408327.0, "reward": 0.15862220525741577, "reward_std": 0.0548260323703289, "rewards/grpo_reward_func/mean": 0.15862220525741577, "rewards/grpo_reward_func/std": 0.12198863923549652, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.703125, "frac_reward_zero_std": 0.0, "grad_norm": 8.4375, "kl": 0.0013896317104808986, "learning_rate": 1e-07, "loss": 0.0001, "num_tokens": 12433679.0, "reward": 0.32602596282958984, "reward_std": 0.13098275661468506, "rewards/grpo_reward_func/mean": 0.32602596282958984, "rewards/grpo_reward_func/std": 0.2107134908437729, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.71875, "frac_reward_zero_std": 0.0, "grad_norm": 14.75, "kl": 0.0025586048141121864, "learning_rate": 9.907407407407407e-08, "loss": 0.0001, "num_tokens": 12458655.0, "reward": 0.3703271150588989, "reward_std": 0.09507998824119568, "rewards/grpo_reward_func/mean": 0.3703271150588989, "rewards/grpo_reward_func/std": 0.14610642194747925, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.734375, "frac_reward_zero_std": 0.0, "grad_norm": 6.0625, "kl": 0.0011041508987545967, "learning_rate": 9.814814814814815e-08, "loss": 0.0, "num_tokens": 12483567.0, "reward": 0.3221014738082886, "reward_std": 0.06510960310697556, "rewards/grpo_reward_func/mean": 0.3221014738082886, "rewards/grpo_reward_func/std": 0.07591387629508972, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.75, "frac_reward_zero_std": 0.0, "grad_norm": 12.6875, "kl": 0.0013404401834122837, "learning_rate": 9.722222222222222e-08, "loss": 0.0001, "num_tokens": 12508847.0, "reward": 0.32358843088150024, "reward_std": 0.14844445884227753, "rewards/grpo_reward_func/mean": 0.32358843088150024, "rewards/grpo_reward_func/std": 0.14955370128154755, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.765625, "frac_reward_zero_std": 0.0, "grad_norm": 6.1875, "kl": 0.0013865028013242409, "learning_rate": 9.629629629629629e-08, "loss": 0.0001, "num_tokens": 12534183.0, "reward": 0.29313117265701294, "reward_std": 0.05017620697617531, "rewards/grpo_reward_func/mean": 0.29313117265701294, "rewards/grpo_reward_func/std": 0.11477138102054596, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.78125, "frac_reward_zero_std": 0.0, "grad_norm": 9.5625, "kl": 0.001455625839298591, "learning_rate": 9.537037037037036e-08, "loss": 0.0001, "num_tokens": 12559055.0, "reward": 0.32869526743888855, "reward_std": 0.11172797530889511, "rewards/grpo_reward_func/mean": 0.32869526743888855, "rewards/grpo_reward_func/std": 0.12100888043642044, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.796875, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "kl": 0.000938947923714295, "learning_rate": 9.444444444444444e-08, "loss": 0.0, "num_tokens": 12583935.0, "reward": 0.41716277599334717, "reward_std": 0.1078467071056366, "rewards/grpo_reward_func/mean": 0.41716277599334717, "rewards/grpo_reward_func/std": 0.11082032322883606, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.8125, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "kl": 0.0005020884127588943, "learning_rate": 9.351851851851851e-08, "loss": 0.0, "num_tokens": 12609103.0, "reward": 0.2469130903482437, "reward_std": 0.07369641214609146, "rewards/grpo_reward_func/mean": 0.2469130903482437, "rewards/grpo_reward_func/std": 0.13795331120491028, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.828125, "frac_reward_zero_std": 0.0, "grad_norm": 13.0, "kl": 0.0005307138635544106, "learning_rate": 9.259259259259258e-08, "loss": 0.0, "num_tokens": 12634127.0, "reward": 0.404737651348114, "reward_std": 0.14841783046722412, "rewards/grpo_reward_func/mean": 0.404737651348114, "rewards/grpo_reward_func/std": 0.15671268105506897, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.84375, "frac_reward_zero_std": 0.0, "grad_norm": 8.6875, "kl": 0.000757849462388549, "learning_rate": 9.166666666666665e-08, "loss": 0.0, "num_tokens": 12659767.0, "reward": 0.14257104694843292, "reward_std": 0.07553324848413467, "rewards/grpo_reward_func/mean": 0.14257104694843292, "rewards/grpo_reward_func/std": 0.09304811805486679, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.859375, "frac_reward_zero_std": 0.0, "grad_norm": 11.3125, "kl": 0.001406958035659045, "learning_rate": 9.074074074074074e-08, "loss": 0.0001, "num_tokens": 12685103.0, "reward": 0.2559998333454132, "reward_std": 0.12699110805988312, "rewards/grpo_reward_func/mean": 0.2559998333454132, "rewards/grpo_reward_func/std": 0.2978763282299042, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.875, "frac_reward_zero_std": 0.0, "grad_norm": 9.125, "kl": 0.0007011145353317261, "learning_rate": 8.981481481481482e-08, "loss": 0.0, "num_tokens": 12709967.0, "reward": 0.4325105547904968, "reward_std": 0.07664240151643753, "rewards/grpo_reward_func/mean": 0.4325105547904968, "rewards/grpo_reward_func/std": 0.1956201046705246, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.890625, "frac_reward_zero_std": 0.0, "grad_norm": 12.6875, "kl": 0.002911916351877153, "learning_rate": 8.888888888888888e-08, "loss": 0.0001, "num_tokens": 12734967.0, "reward": 0.34496262669563293, "reward_std": 0.12534165382385254, "rewards/grpo_reward_func/mean": 0.34496262669563293, "rewards/grpo_reward_func/std": 0.12190457433462143, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.90625, "frac_reward_zero_std": 0.0, "grad_norm": 5.875, "kl": 0.0007352257671300322, "learning_rate": 8.796296296296296e-08, "loss": 0.0, "num_tokens": 12760111.0, "reward": 0.3680327236652374, "reward_std": 0.07478898763656616, "rewards/grpo_reward_func/mean": 0.3680327236652374, "rewards/grpo_reward_func/std": 0.1823076754808426, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.921875, "frac_reward_zero_std": 0.0, "grad_norm": 15.5625, "kl": 0.0011474639468360692, "learning_rate": 8.703703703703703e-08, "loss": 0.0, "num_tokens": 12785271.0, "reward": 0.19895240664482117, "reward_std": 0.16426034271717072, "rewards/grpo_reward_func/mean": 0.19895240664482117, "rewards/grpo_reward_func/std": 0.18529564142227173, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.9375, "frac_reward_zero_std": 0.0, "grad_norm": 12.0, "kl": 0.0006676611083094031, "learning_rate": 8.611111111111111e-08, "loss": 0.0, "num_tokens": 12810359.0, "reward": 0.21774883568286896, "reward_std": 0.11167119443416595, "rewards/grpo_reward_func/mean": 0.21774883568286896, "rewards/grpo_reward_func/std": 0.19753259420394897, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.953125, "frac_reward_zero_std": 0.0, "grad_norm": 16.125, "kl": 0.0012378571555018425, "learning_rate": 8.518518518518517e-08, "loss": 0.0, "num_tokens": 12835495.0, "reward": 0.21574603021144867, "reward_std": 0.16766542196273804, "rewards/grpo_reward_func/mean": 0.21574603021144867, "rewards/grpo_reward_func/std": 0.16853067278862, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.96875, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "kl": 0.0005606446939054877, "learning_rate": 8.425925925925925e-08, "loss": 0.0, "num_tokens": 12860463.0, "reward": 0.3914608359336853, "reward_std": 0.127981498837471, "rewards/grpo_reward_func/mean": 0.3914608359336853, "rewards/grpo_reward_func/std": 0.17107483744621277, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.984375, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "kl": 0.0006222097872523591, "learning_rate": 8.333333333333333e-08, "loss": 0.0, "num_tokens": 12885399.0, "reward": 0.37283188104629517, "reward_std": 0.1533588469028473, "rewards/grpo_reward_func/mean": 0.37283188104629517, "rewards/grpo_reward_func/std": 0.16130177676677704, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.0, "frac_reward_zero_std": 0.0, "grad_norm": 12.4375, "kl": 0.0014068341115489602, "learning_rate": 8.24074074074074e-08, "loss": 0.0001, "num_tokens": 12910527.0, "reward": 0.40698957443237305, "reward_std": 0.09445726871490479, "rewards/grpo_reward_func/mean": 0.40698957443237305, "rewards/grpo_reward_func/std": 0.11586925387382507, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.015625, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.001016762078506872, "learning_rate": 8.148148148148149e-08, "loss": 0.0, "num_tokens": 12936015.0, "reward": 0.28052544593811035, "reward_std": 0.10798147320747375, "rewards/grpo_reward_func/mean": 0.28052544593811035, "rewards/grpo_reward_func/std": 0.2348252683877945, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.03125, "frac_reward_zero_std": 0.0, "grad_norm": 9.0, "kl": 0.0010810171370394528, "learning_rate": 8.055555555555555e-08, "loss": 0.0, "num_tokens": 12961375.0, "reward": 0.4133640229701996, "reward_std": 0.1525183767080307, "rewards/grpo_reward_func/mean": 0.4133640229701996, "rewards/grpo_reward_func/std": 0.1891576498746872, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.046875, "frac_reward_zero_std": 0.0, "grad_norm": 8.4375, "kl": 0.0010025454976130277, "learning_rate": 7.962962962962963e-08, "loss": 0.0, "num_tokens": 12986327.0, "reward": 0.43144044280052185, "reward_std": 0.09400911629199982, "rewards/grpo_reward_func/mean": 0.43144044280052185, "rewards/grpo_reward_func/std": 0.15548229217529297, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.0625, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.0005061101837782189, "learning_rate": 7.87037037037037e-08, "loss": 0.0, "num_tokens": 13011303.0, "reward": 0.42609304189682007, "reward_std": 0.16274592280387878, "rewards/grpo_reward_func/mean": 0.42609304189682007, "rewards/grpo_reward_func/std": 0.18850740790367126, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.078125, "frac_reward_zero_std": 0.0, "grad_norm": 13.875, "kl": 0.0012816967209801078, "learning_rate": 7.777777777777778e-08, "loss": 0.0001, "num_tokens": 13036479.0, "reward": 0.29316410422325134, "reward_std": 0.1588841676712036, "rewards/grpo_reward_func/mean": 0.29316410422325134, "rewards/grpo_reward_func/std": 0.25689324736595154, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.09375, "frac_reward_zero_std": 0.0, "grad_norm": 15.0, "kl": 0.0006455080583691597, "learning_rate": 7.685185185185184e-08, "loss": 0.0, "num_tokens": 13061303.0, "reward": 0.3597118854522705, "reward_std": 0.15937989950180054, "rewards/grpo_reward_func/mean": 0.3597118854522705, "rewards/grpo_reward_func/std": 0.2896607220172882, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.109375, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.0010962964443024248, "learning_rate": 7.592592592592592e-08, "loss": 0.0, "num_tokens": 13086415.0, "reward": 0.34907934069633484, "reward_std": 0.1042337566614151, "rewards/grpo_reward_func/mean": 0.34907934069633484, "rewards/grpo_reward_func/std": 0.12016221135854721, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.125, "frac_reward_zero_std": 0.0, "grad_norm": 8.6875, "kl": 0.0007564701663795859, "learning_rate": 7.5e-08, "loss": 0.0, "num_tokens": 13112343.0, "reward": 0.10363875329494476, "reward_std": 0.06262201070785522, "rewards/grpo_reward_func/mean": 0.10363875329494476, "rewards/grpo_reward_func/std": 0.0699472650885582, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.140625, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.0012816584785468876, "learning_rate": 7.407407407407407e-08, "loss": 0.0001, "num_tokens": 13137687.0, "reward": 0.23938968777656555, "reward_std": 0.1415923833847046, "rewards/grpo_reward_func/mean": 0.23938968777656555, "rewards/grpo_reward_func/std": 0.2066323161125183, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.15625, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.0007490174029953778, "learning_rate": 7.314814814814814e-08, "loss": 0.0, "num_tokens": 13162671.0, "reward": 0.3724169135093689, "reward_std": 0.1446247100830078, "rewards/grpo_reward_func/mean": 0.3724169135093689, "rewards/grpo_reward_func/std": 0.1447208970785141, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.171875, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "kl": 0.0015134557033888996, "learning_rate": 7.222222222222221e-08, "loss": 0.0001, "num_tokens": 13187423.0, "reward": 0.3491423428058624, "reward_std": 0.10657566785812378, "rewards/grpo_reward_func/mean": 0.3491423428058624, "rewards/grpo_reward_func/std": 0.12669718265533447, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.1875, "frac_reward_zero_std": 0.0, "grad_norm": 8.75, "kl": 0.000962267949944362, "learning_rate": 7.12962962962963e-08, "loss": 0.0, "num_tokens": 13212367.0, "reward": 0.35430532693862915, "reward_std": 0.05485477298498154, "rewards/grpo_reward_func/mean": 0.35430532693862915, "rewards/grpo_reward_func/std": 0.05748599022626877, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.203125, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.0007257629185914993, "learning_rate": 7.037037037037038e-08, "loss": 0.0, "num_tokens": 13237967.0, "reward": 0.15453723073005676, "reward_std": 0.1196342259645462, "rewards/grpo_reward_func/mean": 0.15453723073005676, "rewards/grpo_reward_func/std": 0.13181230425834656, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.21875, "frac_reward_zero_std": 0.0, "grad_norm": 12.0, "kl": 0.0010517633927520365, "learning_rate": 6.944444444444444e-08, "loss": 0.0, "num_tokens": 13263071.0, "reward": 0.2398907095193863, "reward_std": 0.08424936234951019, "rewards/grpo_reward_func/mean": 0.2398907095193863, "rewards/grpo_reward_func/std": 0.13464602828025818, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.234375, "frac_reward_zero_std": 0.0, "grad_norm": 11.125, "kl": 0.0009952950931619853, "learning_rate": 6.851851851851852e-08, "loss": 0.0, "num_tokens": 13288159.0, "reward": 0.32014644145965576, "reward_std": 0.11366228759288788, "rewards/grpo_reward_func/mean": 0.32014644145965576, "rewards/grpo_reward_func/std": 0.1127706915140152, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.25, "frac_reward_zero_std": 0.0, "grad_norm": 11.875, "kl": 0.0003101810143562034, "learning_rate": 6.759259259259259e-08, "loss": 0.0, "num_tokens": 13313511.0, "reward": 0.30525702238082886, "reward_std": 0.1390095353126526, "rewards/grpo_reward_func/mean": 0.30525702238082886, "rewards/grpo_reward_func/std": 0.15475143492221832, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.265625, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.0013495491002686322, "learning_rate": 6.666666666666667e-08, "loss": 0.0001, "num_tokens": 13339079.0, "reward": 0.1646089255809784, "reward_std": 0.06642314791679382, "rewards/grpo_reward_func/mean": 0.1646089255809784, "rewards/grpo_reward_func/std": 0.06487837433815002, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.28125, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "kl": 0.0018139145686291158, "learning_rate": 6.574074074074073e-08, "loss": 0.0001, "num_tokens": 13364703.0, "reward": 0.21735727787017822, "reward_std": 0.10561822354793549, "rewards/grpo_reward_func/mean": 0.21735727787017822, "rewards/grpo_reward_func/std": 0.20538462698459625, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.296875, "frac_reward_zero_std": 0.0, "grad_norm": 10.625, "kl": 0.0035593643551692367, "learning_rate": 6.481481481481481e-08, "loss": 0.0001, "num_tokens": 13389807.0, "reward": 0.2676432132720947, "reward_std": 0.10668302327394485, "rewards/grpo_reward_func/mean": 0.2676432132720947, "rewards/grpo_reward_func/std": 0.15589383244514465, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.3125, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.001778287230990827, "learning_rate": 6.388888888888888e-08, "loss": 0.0001, "num_tokens": 13415567.0, "reward": 0.17674584686756134, "reward_std": 0.09293629974126816, "rewards/grpo_reward_func/mean": 0.17674584686756134, "rewards/grpo_reward_func/std": 0.1887131929397583, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.328125, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "kl": 0.000783468916779384, "learning_rate": 6.296296296296296e-08, "loss": 0.0, "num_tokens": 13440455.0, "reward": 0.24244914948940277, "reward_std": 0.07908381521701813, "rewards/grpo_reward_func/mean": 0.24244914948940277, "rewards/grpo_reward_func/std": 0.08193695545196533, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.34375, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "kl": 0.0005902101693209261, "learning_rate": 6.203703703703704e-08, "loss": 0.0, "num_tokens": 13465431.0, "reward": 0.42329832911491394, "reward_std": 0.09570271521806717, "rewards/grpo_reward_func/mean": 0.42329832911491394, "rewards/grpo_reward_func/std": 0.18416033685207367, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.359375, "frac_reward_zero_std": 0.0, "grad_norm": 4.75, "kl": 0.0003727572038769722, "learning_rate": 6.111111111111111e-08, "loss": 0.0, "num_tokens": 13490463.0, "reward": 0.38040339946746826, "reward_std": 0.04217088967561722, "rewards/grpo_reward_func/mean": 0.38040339946746826, "rewards/grpo_reward_func/std": 0.05930864065885544, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.375, "frac_reward_zero_std": 0.0, "grad_norm": 8.4375, "kl": 0.001078493587556295, "learning_rate": 6.018518518518517e-08, "loss": 0.0, "num_tokens": 13515199.0, "reward": 0.39346325397491455, "reward_std": 0.05055631697177887, "rewards/grpo_reward_func/mean": 0.39346325397491455, "rewards/grpo_reward_func/std": 0.06339211761951447, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.390625, "frac_reward_zero_std": 0.0, "grad_norm": 9.6875, "kl": 0.0013342679594643414, "learning_rate": 5.925925925925926e-08, "loss": 0.0001, "num_tokens": 13540535.0, "reward": 0.31230786442756653, "reward_std": 0.12040304392576218, "rewards/grpo_reward_func/mean": 0.31230786442756653, "rewards/grpo_reward_func/std": 0.2622143626213074, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.40625, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "kl": 0.002153427602024749, "learning_rate": 5.833333333333333e-08, "loss": 0.0001, "num_tokens": 13565415.0, "reward": 0.3166807293891907, "reward_std": 0.10012571513652802, "rewards/grpo_reward_func/mean": 0.3166807293891907, "rewards/grpo_reward_func/std": 0.10676856338977814, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.421875, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "kl": 0.0007572459580842406, "learning_rate": 5.74074074074074e-08, "loss": 0.0, "num_tokens": 13591375.0, "reward": 0.0935957282781601, "reward_std": 0.09053429961204529, "rewards/grpo_reward_func/mean": 0.0935957282781601, "rewards/grpo_reward_func/std": 0.09107687324285507, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.4375, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "kl": 0.0015325732820201665, "learning_rate": 5.648148148148148e-08, "loss": 0.0001, "num_tokens": 13616223.0, "reward": 0.40333813428878784, "reward_std": 0.06387582421302795, "rewards/grpo_reward_func/mean": 0.40333813428878784, "rewards/grpo_reward_func/std": 0.15431678295135498, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.453125, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "kl": 0.0007481267966795713, "learning_rate": 5.555555555555555e-08, "loss": 0.0, "num_tokens": 13641055.0, "reward": 0.45801687240600586, "reward_std": 0.08247587084770203, "rewards/grpo_reward_func/mean": 0.45801687240600586, "rewards/grpo_reward_func/std": 0.08276832848787308, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.46875, "frac_reward_zero_std": 0.0, "grad_norm": 15.1875, "kl": 0.0021362415864132345, "learning_rate": 5.462962962962963e-08, "loss": 0.0001, "num_tokens": 13665991.0, "reward": 0.4474378824234009, "reward_std": 0.13480228185653687, "rewards/grpo_reward_func/mean": 0.4474378824234009, "rewards/grpo_reward_func/std": 0.1332484483718872, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.484375, "frac_reward_zero_std": 0.0, "grad_norm": 14.875, "kl": 0.0008189262007363141, "learning_rate": 5.37037037037037e-08, "loss": 0.0, "num_tokens": 13691007.0, "reward": 0.30340880155563354, "reward_std": 0.15250803530216217, "rewards/grpo_reward_func/mean": 0.30340880155563354, "rewards/grpo_reward_func/std": 0.21679265797138214, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.5, "frac_reward_zero_std": 0.0, "grad_norm": 23.75, "kl": 0.003065172815695405, "learning_rate": 5.2777777777777776e-08, "loss": 0.0001, "num_tokens": 13716575.0, "reward": 0.2417331337928772, "reward_std": 0.14743976294994354, "rewards/grpo_reward_func/mean": 0.2417331337928772, "rewards/grpo_reward_func/std": 0.2199942171573639, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.515625, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "kl": 0.0016453542630188167, "learning_rate": 5.1851851851851846e-08, "loss": 0.0001, "num_tokens": 13742175.0, "reward": 0.16955968737602234, "reward_std": 0.1037115603685379, "rewards/grpo_reward_func/mean": 0.16955968737602234, "rewards/grpo_reward_func/std": 0.16303950548171997, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.53125, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "kl": 0.000976884097326547, "learning_rate": 5.092592592592592e-08, "loss": 0.0, "num_tokens": 13767983.0, "reward": 0.16923588514328003, "reward_std": 0.1803411841392517, "rewards/grpo_reward_func/mean": 0.16923588514328003, "rewards/grpo_reward_func/std": 0.19898511469364166, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.546875, "frac_reward_zero_std": 0.0, "grad_norm": 14.5, "kl": 0.0012220792996231467, "learning_rate": 5e-08, "loss": 0.0, "num_tokens": 13792751.0, "reward": 0.40972161293029785, "reward_std": 0.12056250870227814, "rewards/grpo_reward_func/mean": 0.40972161293029785, "rewards/grpo_reward_func/std": 0.11885331571102142, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.5625, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.0010280332644470036, "learning_rate": 4.9074074074074074e-08, "loss": 0.0, "num_tokens": 13817871.0, "reward": 0.4193640649318695, "reward_std": 0.10699457675218582, "rewards/grpo_reward_func/mean": 0.4193640649318695, "rewards/grpo_reward_func/std": 0.10446963459253311, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.578125, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "kl": 0.0008616912818979472, "learning_rate": 4.814814814814814e-08, "loss": 0.0, "num_tokens": 13843359.0, "reward": 0.2062494158744812, "reward_std": 0.1384025514125824, "rewards/grpo_reward_func/mean": 0.2062494158744812, "rewards/grpo_reward_func/std": 0.23651915788650513, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.59375, "frac_reward_zero_std": 0.0, "grad_norm": 13.8125, "kl": 0.0014753906289115548, "learning_rate": 4.722222222222222e-08, "loss": 0.0001, "num_tokens": 13868591.0, "reward": 0.30149251222610474, "reward_std": 0.1228085309267044, "rewards/grpo_reward_func/mean": 0.30149251222610474, "rewards/grpo_reward_func/std": 0.2368825227022171, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.609375, "frac_reward_zero_std": 0.0, "grad_norm": 8.6875, "kl": 0.0016866040241438895, "learning_rate": 4.629629629629629e-08, "loss": 0.0001, "num_tokens": 13893495.0, "reward": 0.39722514152526855, "reward_std": 0.0929851084947586, "rewards/grpo_reward_func/mean": 0.39722514152526855, "rewards/grpo_reward_func/std": 0.09578373283147812, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.625, "frac_reward_zero_std": 0.0, "grad_norm": 7.53125, "kl": 0.0008672124822624028, "learning_rate": 4.537037037037037e-08, "loss": 0.0, "num_tokens": 13919215.0, "reward": 0.15501129627227783, "reward_std": 0.05672089755535126, "rewards/grpo_reward_func/mean": 0.15501129627227783, "rewards/grpo_reward_func/std": 0.06247472018003464, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.640625, "frac_reward_zero_std": 0.0, "grad_norm": 9.6875, "kl": 0.0009078836592379957, "learning_rate": 4.444444444444444e-08, "loss": 0.0, "num_tokens": 13944183.0, "reward": 0.29007142782211304, "reward_std": 0.13152144849300385, "rewards/grpo_reward_func/mean": 0.29007142782211304, "rewards/grpo_reward_func/std": 0.14510175585746765, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.65625, "frac_reward_zero_std": 0.0, "grad_norm": 7.34375, "kl": 0.0005848153232363984, "learning_rate": 4.351851851851852e-08, "loss": 0.0, "num_tokens": 13969375.0, "reward": 0.37974289059638977, "reward_std": 0.09775005280971527, "rewards/grpo_reward_func/mean": 0.37974289059638977, "rewards/grpo_reward_func/std": 0.19289840757846832, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.671875, "frac_reward_zero_std": 0.0, "grad_norm": 5.8125, "kl": 0.00016175458586076275, "learning_rate": 4.2592592592592586e-08, "loss": 0.0, "num_tokens": 13994311.0, "reward": 0.41699835658073425, "reward_std": 0.056771546602249146, "rewards/grpo_reward_func/mean": 0.41699835658073425, "rewards/grpo_reward_func/std": 0.1137986108660698, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.6875, "frac_reward_zero_std": 0.0, "grad_norm": 7.5, "kl": 0.0013509286800399423, "learning_rate": 4.166666666666666e-08, "loss": 0.0001, "num_tokens": 14019495.0, "reward": 0.3929343521595001, "reward_std": 0.14318805932998657, "rewards/grpo_reward_func/mean": 0.3929343521595001, "rewards/grpo_reward_func/std": 0.2004500776529312, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.703125, "frac_reward_zero_std": 0.0, "grad_norm": 10.5625, "kl": 0.0014943527057766914, "learning_rate": 4.0740740740740745e-08, "loss": 0.0001, "num_tokens": 14044207.0, "reward": 0.4945647716522217, "reward_std": 0.1458444595336914, "rewards/grpo_reward_func/mean": 0.4945647716522217, "rewards/grpo_reward_func/std": 0.14586947858333588, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.71875, "frac_reward_zero_std": 0.0, "grad_norm": 13.125, "kl": 0.0004489005805226043, "learning_rate": 3.9814814814814815e-08, "loss": 0.0, "num_tokens": 14069927.0, "reward": 0.07546406239271164, "reward_std": 0.09835518896579742, "rewards/grpo_reward_func/mean": 0.07546406239271164, "rewards/grpo_reward_func/std": 0.09766824543476105, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.734375, "frac_reward_zero_std": 0.0, "grad_norm": 7.96875, "kl": 0.0015613465220667422, "learning_rate": 3.888888888888889e-08, "loss": 0.0001, "num_tokens": 14095207.0, "reward": 0.42614489793777466, "reward_std": 0.12422450631856918, "rewards/grpo_reward_func/mean": 0.42614489793777466, "rewards/grpo_reward_func/std": 0.17880021035671234, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.75, "frac_reward_zero_std": 0.0, "grad_norm": 13.5625, "kl": 0.0030846113804727793, "learning_rate": 3.796296296296296e-08, "loss": 0.0001, "num_tokens": 14120207.0, "reward": 0.2813106179237366, "reward_std": 0.12345054000616074, "rewards/grpo_reward_func/mean": 0.2813106179237366, "rewards/grpo_reward_func/std": 0.1639048308134079, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.765625, "frac_reward_zero_std": 0.0, "grad_norm": 16.375, "kl": 0.0009916585986502469, "learning_rate": 3.7037037037037036e-08, "loss": 0.0, "num_tokens": 14145495.0, "reward": 0.2581431269645691, "reward_std": 0.12751302123069763, "rewards/grpo_reward_func/mean": 0.2581431269645691, "rewards/grpo_reward_func/std": 0.1969458907842636, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.78125, "frac_reward_zero_std": 0.0, "grad_norm": 11.8125, "kl": 0.0007693757943343371, "learning_rate": 3.6111111111111106e-08, "loss": 0.0, "num_tokens": 14170759.0, "reward": 0.3326881229877472, "reward_std": 0.12007895112037659, "rewards/grpo_reward_func/mean": 0.3326881229877472, "rewards/grpo_reward_func/std": 0.1920982152223587, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.796875, "frac_reward_zero_std": 0.0, "grad_norm": 11.0, "kl": 0.0011726654774975032, "learning_rate": 3.518518518518519e-08, "loss": 0.0, "num_tokens": 14196303.0, "reward": 0.08724980056285858, "reward_std": 0.11998534202575684, "rewards/grpo_reward_func/mean": 0.08724980056285858, "rewards/grpo_reward_func/std": 0.1232389286160469, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.8125, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.00046126171946525574, "learning_rate": 3.425925925925926e-08, "loss": 0.0, "num_tokens": 14221079.0, "reward": 0.3096466362476349, "reward_std": 0.06632121652364731, "rewards/grpo_reward_func/mean": 0.3096466362476349, "rewards/grpo_reward_func/std": 0.10011015087366104, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.828125, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "kl": 0.0005951582861598581, "learning_rate": 3.3333333333333334e-08, "loss": 0.0, "num_tokens": 14246583.0, "reward": 0.2074100375175476, "reward_std": 0.24280470609664917, "rewards/grpo_reward_func/mean": 0.2074100375175476, "rewards/grpo_reward_func/std": 0.24377292394638062, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.84375, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "kl": 0.0016201141115743667, "learning_rate": 3.2407407407407403e-08, "loss": 0.0001, "num_tokens": 14271647.0, "reward": 0.2839413583278656, "reward_std": 0.10143469274044037, "rewards/grpo_reward_func/mean": 0.2839413583278656, "rewards/grpo_reward_func/std": 0.12659397721290588, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.859375, "frac_reward_zero_std": 0.0, "grad_norm": 9.375, "kl": 0.0010330639779567719, "learning_rate": 3.148148148148148e-08, "loss": 0.0, "num_tokens": 14296583.0, "reward": 0.322548508644104, "reward_std": 0.09603136777877808, "rewards/grpo_reward_func/mean": 0.322548508644104, "rewards/grpo_reward_func/std": 0.09745854139328003, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.875, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.00181583222001791, "learning_rate": 3.0555555555555556e-08, "loss": 0.0001, "num_tokens": 14321959.0, "reward": 0.20520664751529694, "reward_std": 0.10753442347049713, "rewards/grpo_reward_func/mean": 0.20520664751529694, "rewards/grpo_reward_func/std": 0.21462036669254303, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.890625, "frac_reward_zero_std": 0.0, "grad_norm": 17.5, "kl": 0.002053163305390626, "learning_rate": 2.962962962962963e-08, "loss": 0.0001, "num_tokens": 14347727.0, "reward": 0.07180184125900269, "reward_std": 0.12105913460254669, "rewards/grpo_reward_func/mean": 0.07180184125900269, "rewards/grpo_reward_func/std": 0.11819092184305191, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.90625, "frac_reward_zero_std": 0.0, "grad_norm": 14.0625, "kl": 0.0011411278101149946, "learning_rate": 2.87037037037037e-08, "loss": 0.0, "num_tokens": 14372879.0, "reward": 0.34137165546417236, "reward_std": 0.13372355699539185, "rewards/grpo_reward_func/mean": 0.34137165546417236, "rewards/grpo_reward_func/std": 0.15903595089912415, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.921875, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.0012179824407212436, "learning_rate": 2.7777777777777774e-08, "loss": 0.0, "num_tokens": 14397903.0, "reward": 0.32042965292930603, "reward_std": 0.10691528767347336, "rewards/grpo_reward_func/mean": 0.32042965292930603, "rewards/grpo_reward_func/std": 0.13944755494594574, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.9375, "frac_reward_zero_std": 0.0, "grad_norm": 8.9375, "kl": 0.0027775077614933252, "learning_rate": 2.685185185185185e-08, "loss": 0.0001, "num_tokens": 14422735.0, "reward": 0.41792985796928406, "reward_std": 0.18040655553340912, "rewards/grpo_reward_func/mean": 0.41792985796928406, "rewards/grpo_reward_func/std": 0.1744299978017807, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.953125, "frac_reward_zero_std": 0.0, "grad_norm": 7.375, "kl": 0.0006387926114257425, "learning_rate": 2.5925925925925923e-08, "loss": 0.0, "num_tokens": 14448311.0, "reward": 0.20609521865844727, "reward_std": 0.05411346256732941, "rewards/grpo_reward_func/mean": 0.20609521865844727, "rewards/grpo_reward_func/std": 0.19829832017421722, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.96875, "frac_reward_zero_std": 0.0, "grad_norm": 15.0, "kl": 0.0012447184999473393, "learning_rate": 2.5e-08, "loss": 0.0, "num_tokens": 14474111.0, "reward": 0.10689907521009445, "reward_std": 0.11656990647315979, "rewards/grpo_reward_func/mean": 0.10689907521009445, "rewards/grpo_reward_func/std": 0.12091051787137985, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.984375, "frac_reward_zero_std": 0.0, "grad_norm": 12.5625, "kl": 0.0006491482927231118, "learning_rate": 2.407407407407407e-08, "loss": 0.0, "num_tokens": 14498855.0, "reward": 0.34690797328948975, "reward_std": 0.08506066352128983, "rewards/grpo_reward_func/mean": 0.34690797328948975, "rewards/grpo_reward_func/std": 0.0848422721028328, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.0, "frac_reward_zero_std": 0.0, "grad_norm": 8.1875, "kl": 0.0013912487775087357, "learning_rate": 2.3148148148148144e-08, "loss": 0.0001, "num_tokens": 14524343.0, "reward": 0.2778702676296234, "reward_std": 0.08481252193450928, "rewards/grpo_reward_func/mean": 0.2778702676296234, "rewards/grpo_reward_func/std": 0.25761204957962036, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.015625, "frac_reward_zero_std": 0.0, "grad_norm": 10.4375, "kl": 0.0012237662449479103, "learning_rate": 2.222222222222222e-08, "loss": 0.0, "num_tokens": 14549311.0, "reward": 0.4252242147922516, "reward_std": 0.15082389116287231, "rewards/grpo_reward_func/mean": 0.4252242147922516, "rewards/grpo_reward_func/std": 0.19061405956745148, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.03125, "frac_reward_zero_std": 0.0, "grad_norm": 12.75, "kl": 0.001242486119735986, "learning_rate": 2.1296296296296293e-08, "loss": 0.0, "num_tokens": 14574871.0, "reward": 0.1958128809928894, "reward_std": 0.11874909698963165, "rewards/grpo_reward_func/mean": 0.1958128809928894, "rewards/grpo_reward_func/std": 0.19743791222572327, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.046875, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "kl": 0.0009185175294987857, "learning_rate": 2.0370370370370373e-08, "loss": 0.0, "num_tokens": 14599783.0, "reward": 0.34701311588287354, "reward_std": 0.10938475281000137, "rewards/grpo_reward_func/mean": 0.34701311588287354, "rewards/grpo_reward_func/std": 0.13787339627742767, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.0625, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "kl": 0.0007825860229786485, "learning_rate": 1.9444444444444445e-08, "loss": 0.0, "num_tokens": 14624775.0, "reward": 0.3385382890701294, "reward_std": 0.12184540182352066, "rewards/grpo_reward_func/mean": 0.3385382890701294, "rewards/grpo_reward_func/std": 0.12013304233551025, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.078125, "frac_reward_zero_std": 0.0, "grad_norm": 7.0, "kl": 0.0004937490448355675, "learning_rate": 1.8518518518518518e-08, "loss": 0.0, "num_tokens": 14649759.0, "reward": 0.3188565969467163, "reward_std": 0.0923345610499382, "rewards/grpo_reward_func/mean": 0.3188565969467163, "rewards/grpo_reward_func/std": 0.1829162985086441, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.09375, "frac_reward_zero_std": 0.0, "grad_norm": 8.75, "kl": 0.0009138956665992737, "learning_rate": 1.7592592592592594e-08, "loss": 0.0, "num_tokens": 14675047.0, "reward": 0.2307702898979187, "reward_std": 0.08643310517072678, "rewards/grpo_reward_func/mean": 0.2307702898979187, "rewards/grpo_reward_func/std": 0.11515053361654282, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.109375, "frac_reward_zero_std": 0.0, "grad_norm": 21.125, "kl": 0.0005873590707778931, "learning_rate": 1.6666666666666667e-08, "loss": 0.0, "num_tokens": 14699943.0, "reward": 0.39582157135009766, "reward_std": 0.12089787423610687, "rewards/grpo_reward_func/mean": 0.39582157135009766, "rewards/grpo_reward_func/std": 0.1251089870929718, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.125, "frac_reward_zero_std": 0.0, "grad_norm": 12.125, "kl": 0.00180981180164963, "learning_rate": 1.574074074074074e-08, "loss": 0.0001, "num_tokens": 14724911.0, "reward": 0.42574357986450195, "reward_std": 0.12509074807167053, "rewards/grpo_reward_func/mean": 0.42574357986450195, "rewards/grpo_reward_func/std": 0.13018791377544403, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.140625, "frac_reward_zero_std": 0.0, "grad_norm": 9.6875, "kl": 0.0008814459142740816, "learning_rate": 1.4814814814814814e-08, "loss": 0.0, "num_tokens": 14750199.0, "reward": 0.21241606771945953, "reward_std": 0.0747273787856102, "rewards/grpo_reward_func/mean": 0.21241606771945953, "rewards/grpo_reward_func/std": 0.1790691465139389, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.15625, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "kl": 0.0019009755342267454, "learning_rate": 1.3888888888888887e-08, "loss": 0.0001, "num_tokens": 14775255.0, "reward": 0.43149012327194214, "reward_std": 0.11503390967845917, "rewards/grpo_reward_func/mean": 0.43149012327194214, "rewards/grpo_reward_func/std": 0.14789779484272003, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.171875, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "kl": 0.0011376099428161979, "learning_rate": 1.2962962962962961e-08, "loss": 0.0, "num_tokens": 14800007.0, "reward": 0.42513400316238403, "reward_std": 0.10579125583171844, "rewards/grpo_reward_func/mean": 0.42513400316238403, "rewards/grpo_reward_func/std": 0.10767898708581924, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.1875, "frac_reward_zero_std": 0.0, "grad_norm": 7.4375, "kl": 0.0003657530469354242, "learning_rate": 1.2037037037037036e-08, "loss": 0.0, "num_tokens": 14825775.0, "reward": 0.16337397694587708, "reward_std": 0.1274218112230301, "rewards/grpo_reward_func/mean": 0.16337397694587708, "rewards/grpo_reward_func/std": 0.15495631098747253, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.203125, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "kl": 0.0011066117731388658, "learning_rate": 1.111111111111111e-08, "loss": 0.0, "num_tokens": 14850807.0, "reward": 0.3153996467590332, "reward_std": 0.12657006084918976, "rewards/grpo_reward_func/mean": 0.3153996467590332, "rewards/grpo_reward_func/std": 0.17775346338748932, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.21875, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "kl": 0.0005893185880267993, "learning_rate": 1.0185185185185186e-08, "loss": 0.0, "num_tokens": 14876407.0, "reward": 0.07492673397064209, "reward_std": 0.09368358552455902, "rewards/grpo_reward_func/mean": 0.07492673397064209, "rewards/grpo_reward_func/std": 0.09934189170598984, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.234375, "frac_reward_zero_std": 0.0, "grad_norm": 15.8125, "kl": 0.0015896050026640296, "learning_rate": 9.259259259259259e-09, "loss": 0.0001, "num_tokens": 14901791.0, "reward": 0.21189236640930176, "reward_std": 0.20999035239219666, "rewards/grpo_reward_func/mean": 0.21189236640930176, "rewards/grpo_reward_func/std": 0.22601766884326935, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.25, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "kl": 0.0007123357499949634, "learning_rate": 8.333333333333334e-09, "loss": 0.0, "num_tokens": 14926863.0, "reward": 0.4173913598060608, "reward_std": 0.10256533324718475, "rewards/grpo_reward_func/mean": 0.4173913598060608, "rewards/grpo_reward_func/std": 0.11102120578289032, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.265625, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "kl": 0.0007044219819363207, "learning_rate": 7.407407407407407e-09, "loss": 0.0, "num_tokens": 14952511.0, "reward": 0.20125332474708557, "reward_std": 0.11327949911355972, "rewards/grpo_reward_func/mean": 0.20125332474708557, "rewards/grpo_reward_func/std": 0.2046782523393631, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.28125, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "kl": 0.000963706843322143, "learning_rate": 6.481481481481481e-09, "loss": 0.0, "num_tokens": 14977439.0, "reward": 0.41489923000335693, "reward_std": 0.1567956805229187, "rewards/grpo_reward_func/mean": 0.41489923000335693, "rewards/grpo_reward_func/std": 0.15223918855190277, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.296875, "frac_reward_zero_std": 0.0, "grad_norm": 12.125, "kl": 0.0006893044337630272, "learning_rate": 5.555555555555555e-09, "loss": 0.0, "num_tokens": 15002111.0, "reward": 0.601151704788208, "reward_std": 0.12341496348381042, "rewards/grpo_reward_func/mean": 0.601151704788208, "rewards/grpo_reward_func/std": 0.1384855955839157, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.3125, "frac_reward_zero_std": 0.0, "grad_norm": 12.875, "kl": 0.001145336776971817, "learning_rate": 4.6296296296296295e-09, "loss": 0.0, "num_tokens": 15027239.0, "reward": 0.19861392676830292, "reward_std": 0.1082070991396904, "rewards/grpo_reward_func/mean": 0.19861392676830292, "rewards/grpo_reward_func/std": 0.15457463264465332, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.328125, "frac_reward_zero_std": 0.0, "grad_norm": 12.5625, "kl": 0.0013100424548611045, "learning_rate": 3.7037037037037036e-09, "loss": 0.0001, "num_tokens": 15052759.0, "reward": 0.15435832738876343, "reward_std": 0.17842328548431396, "rewards/grpo_reward_func/mean": 0.15435832738876343, "rewards/grpo_reward_func/std": 0.17921017110347748, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.34375, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "kl": 0.0014240065356716514, "learning_rate": 2.7777777777777776e-09, "loss": 0.0001, "num_tokens": 15078943.0, "reward": 0.0750146359205246, "reward_std": 0.1408962607383728, "rewards/grpo_reward_func/mean": 0.0750146359205246, "rewards/grpo_reward_func/std": 0.14366577565670013, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.359375, "frac_reward_zero_std": 0.0, "grad_norm": 5.75, "kl": 0.0006565783696714789, "learning_rate": 1.8518518518518518e-09, "loss": 0.0, "num_tokens": 15104375.0, "reward": 0.21018442511558533, "reward_std": 0.062313079833984375, "rewards/grpo_reward_func/mean": 0.21018442511558533, "rewards/grpo_reward_func/std": 0.10198579728603363, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.375, "frac_reward_zero_std": 0.0, "grad_norm": 14.375, "kl": 0.001865773752797395, "learning_rate": 9.259259259259259e-10, "loss": 0.0001, "num_tokens": 15129007.0, "reward": 0.5128255486488342, "reward_std": 0.1550142616033554, "rewards/grpo_reward_func/mean": 0.5128255486488342, "rewards/grpo_reward_func/std": 0.1619613915681839, "step": 600 } ], "logging_steps": 1, "max_steps": 600, "num_input_tokens_seen": 15129007, "num_train_epochs": 10, "save_steps": 600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }