Files
brainrl-grpo-single-m/checkpoint-300/trainer_state.json
ModelHub XC 89105f84cb 初始化项目,由ModelHub XC社区提供模型
Model: Mohith202/brainrl-grpo-single-m
Source: Original Platform
2026-04-29 19:17:17 +08:00

7535 lines
258 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.375,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.3125,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 12528.0,
"reward": 0.39676433801651,
"reward_std": 0.11280547827482224,
"rewards/grpo_reward_func/mean": 0.39676433801651,
"rewards/grpo_reward_func/std": 0.13478560745716095,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.8125,
"learning_rate": 9.966666666666667e-07,
"loss": 0.0,
"num_tokens": 24884.0,
"reward": 0.4752987027168274,
"reward_std": 0.13702644407749176,
"rewards/grpo_reward_func/mean": 0.4752987027168274,
"rewards/grpo_reward_func/std": 0.17374587059020996,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.625,
"learning_rate": 9.933333333333333e-07,
"loss": -0.0,
"num_tokens": 37352.0,
"reward": 0.44525083899497986,
"reward_std": 0.10103905200958252,
"rewards/grpo_reward_func/mean": 0.44525083899497986,
"rewards/grpo_reward_func/std": 0.0979275107383728,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.125,
"learning_rate": 9.9e-07,
"loss": 0.0,
"num_tokens": 49744.0,
"reward": 0.399270236492157,
"reward_std": 0.10935800522565842,
"rewards/grpo_reward_func/mean": 0.399270236492157,
"rewards/grpo_reward_func/std": 0.10536573082208633,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.75,
"learning_rate": 9.866666666666666e-07,
"loss": 0.0,
"num_tokens": 62224.0,
"reward": 0.3989260196685791,
"reward_std": 0.11544467508792877,
"rewards/grpo_reward_func/mean": 0.3989260196685791,
"rewards/grpo_reward_func/std": 0.11394146084785461,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.125,
"learning_rate": 9.833333333333332e-07,
"loss": -0.0,
"num_tokens": 74736.0,
"reward": 0.42444688081741333,
"reward_std": 0.14600424468517303,
"rewards/grpo_reward_func/mean": 0.42444688081741333,
"rewards/grpo_reward_func/std": 0.17498743534088135,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.125,
"learning_rate": 9.8e-07,
"loss": 0.0,
"num_tokens": 87100.0,
"reward": 0.4266095757484436,
"reward_std": 0.0954706147313118,
"rewards/grpo_reward_func/mean": 0.4266095757484436,
"rewards/grpo_reward_func/std": 0.09790605306625366,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.5,
"learning_rate": 9.766666666666667e-07,
"loss": -0.0,
"num_tokens": 99496.0,
"reward": 0.4947161376476288,
"reward_std": 0.07030671834945679,
"rewards/grpo_reward_func/mean": 0.4947161376476288,
"rewards/grpo_reward_func/std": 0.07488483190536499,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.8125,
"learning_rate": 9.733333333333333e-07,
"loss": -0.0,
"num_tokens": 111844.0,
"reward": 0.4835072159767151,
"reward_std": 0.1621960997581482,
"rewards/grpo_reward_func/mean": 0.4835072159767151,
"rewards/grpo_reward_func/std": 0.17284278571605682,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.0,
"learning_rate": 9.7e-07,
"loss": -0.0,
"num_tokens": 124240.0,
"reward": 0.4783210754394531,
"reward_std": 0.09915027022361755,
"rewards/grpo_reward_func/mean": 0.4783210754394531,
"rewards/grpo_reward_func/std": 0.11161749064922333,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.6875,
"learning_rate": 9.666666666666666e-07,
"loss": -0.0,
"num_tokens": 136652.0,
"reward": 0.40330448746681213,
"reward_std": 0.10881966352462769,
"rewards/grpo_reward_func/mean": 0.40330448746681213,
"rewards/grpo_reward_func/std": 0.1156788170337677,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.75,
"learning_rate": 9.633333333333334e-07,
"loss": -0.0,
"num_tokens": 149048.0,
"reward": 0.41300415992736816,
"reward_std": 0.13600921630859375,
"rewards/grpo_reward_func/mean": 0.41300415992736816,
"rewards/grpo_reward_func/std": 0.1646273136138916,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.625,
"learning_rate": 9.6e-07,
"loss": 0.0,
"num_tokens": 161276.0,
"reward": 0.4857324957847595,
"reward_std": 0.09516896307468414,
"rewards/grpo_reward_func/mean": 0.4857324957847595,
"rewards/grpo_reward_func/std": 0.09173914790153503,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.8125,
"learning_rate": 9.566666666666667e-07,
"loss": 0.0,
"num_tokens": 173780.0,
"reward": 0.4015089273452759,
"reward_std": 0.06604111194610596,
"rewards/grpo_reward_func/mean": 0.4015089273452759,
"rewards/grpo_reward_func/std": 0.07018419355154037,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.6875,
"learning_rate": 9.533333333333333e-07,
"loss": -0.0,
"num_tokens": 186192.0,
"reward": 0.31999891996383667,
"reward_std": 0.0805739015340805,
"rewards/grpo_reward_func/mean": 0.31999891996383667,
"rewards/grpo_reward_func/std": 0.08632533997297287,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.1875,
"learning_rate": 9.499999999999999e-07,
"loss": -0.0,
"num_tokens": 198684.0,
"reward": 0.39560186862945557,
"reward_std": 0.09632067382335663,
"rewards/grpo_reward_func/mean": 0.39560186862945557,
"rewards/grpo_reward_func/std": 0.09369846433401108,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.5,
"learning_rate": 9.466666666666666e-07,
"loss": 0.0,
"num_tokens": 211096.0,
"reward": 0.48571068048477173,
"reward_std": 0.15206970274448395,
"rewards/grpo_reward_func/mean": 0.48571068048477173,
"rewards/grpo_reward_func/std": 0.1438637524843216,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.09375,
"learning_rate": 9.433333333333333e-07,
"loss": -0.0,
"num_tokens": 223552.0,
"reward": 0.45060235261917114,
"reward_std": 0.05437461659312248,
"rewards/grpo_reward_func/mean": 0.45060235261917114,
"rewards/grpo_reward_func/std": 0.140779510140419,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.78125,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0,
"num_tokens": 236036.0,
"reward": 0.4261874556541443,
"reward_std": 0.09510611742734909,
"rewards/grpo_reward_func/mean": 0.4261874556541443,
"rewards/grpo_reward_func/std": 0.10084228217601776,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.375,
"learning_rate": 9.366666666666666e-07,
"loss": -0.0,
"num_tokens": 248448.0,
"reward": 0.29703885316848755,
"reward_std": 0.046393271535634995,
"rewards/grpo_reward_func/mean": 0.29703885316848755,
"rewards/grpo_reward_func/std": 0.04335997626185417,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.0625,
"learning_rate": 9.333333333333333e-07,
"loss": 0.0,
"num_tokens": 260776.0,
"reward": 0.45774269104003906,
"reward_std": 0.16561079025268555,
"rewards/grpo_reward_func/mean": 0.45774269104003906,
"rewards/grpo_reward_func/std": 0.15406657755374908,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.125,
"learning_rate": 9.3e-07,
"loss": 0.0,
"num_tokens": 273248.0,
"reward": 0.4235140085220337,
"reward_std": 0.06906857341527939,
"rewards/grpo_reward_func/mean": 0.4235140085220337,
"rewards/grpo_reward_func/std": 0.07242283225059509,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.25,
"learning_rate": 9.266666666666665e-07,
"loss": 0.0,
"num_tokens": 285724.0,
"reward": 0.36918026208877563,
"reward_std": 0.06028338894248009,
"rewards/grpo_reward_func/mean": 0.36918026208877563,
"rewards/grpo_reward_func/std": 0.0693485215306282,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.0625,
"learning_rate": 9.233333333333333e-07,
"loss": 0.0,
"num_tokens": 298132.0,
"reward": 0.3204312324523926,
"reward_std": 0.07052356004714966,
"rewards/grpo_reward_func/mean": 0.3204312324523926,
"rewards/grpo_reward_func/std": 0.09546414762735367,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.0625,
"learning_rate": 9.2e-07,
"loss": -0.0,
"num_tokens": 310584.0,
"reward": 0.38078033924102783,
"reward_std": 0.13373351097106934,
"rewards/grpo_reward_func/mean": 0.38078033924102783,
"rewards/grpo_reward_func/std": 0.13402824103832245,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.6875,
"learning_rate": 9.166666666666665e-07,
"loss": 0.0,
"num_tokens": 323076.0,
"reward": 0.3454480767250061,
"reward_std": 0.10349850356578827,
"rewards/grpo_reward_func/mean": 0.3454480767250061,
"rewards/grpo_reward_func/std": 0.12671217322349548,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.96875,
"learning_rate": 9.133333333333333e-07,
"loss": 0.0,
"num_tokens": 335520.0,
"reward": 0.3619287312030792,
"reward_std": 0.12553678452968597,
"rewards/grpo_reward_func/mean": 0.3619287312030792,
"rewards/grpo_reward_func/std": 0.1537715494632721,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.9375,
"learning_rate": 9.1e-07,
"loss": 0.0,
"num_tokens": 347940.0,
"reward": 0.3436325788497925,
"reward_std": 0.09887667000293732,
"rewards/grpo_reward_func/mean": 0.3436325788497925,
"rewards/grpo_reward_func/std": 0.12251166999340057,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.0,
"learning_rate": 9.066666666666665e-07,
"loss": -0.0,
"num_tokens": 360388.0,
"reward": 0.4369204044342041,
"reward_std": 0.19640696048736572,
"rewards/grpo_reward_func/mean": 0.4369204044342041,
"rewards/grpo_reward_func/std": 0.1927463412284851,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.8125,
"learning_rate": 9.033333333333333e-07,
"loss": 0.0,
"num_tokens": 372832.0,
"reward": 0.4874047338962555,
"reward_std": 0.053364820778369904,
"rewards/grpo_reward_func/mean": 0.4874047338962555,
"rewards/grpo_reward_func/std": 0.08248723298311234,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.78125,
"learning_rate": 9e-07,
"loss": -0.0,
"num_tokens": 385256.0,
"reward": 0.4391651451587677,
"reward_std": 0.07597412914037704,
"rewards/grpo_reward_func/mean": 0.4391651451587677,
"rewards/grpo_reward_func/std": 0.13502921164035797,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.5,
"learning_rate": 8.966666666666666e-07,
"loss": 0.0,
"num_tokens": 397784.0,
"reward": 0.552140474319458,
"reward_std": 0.1218448132276535,
"rewards/grpo_reward_func/mean": 0.552140474319458,
"rewards/grpo_reward_func/std": 0.11282333731651306,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.5625,
"learning_rate": 8.933333333333333e-07,
"loss": -0.0,
"num_tokens": 410304.0,
"reward": 0.4041430950164795,
"reward_std": 0.1936928927898407,
"rewards/grpo_reward_func/mean": 0.4041430950164795,
"rewards/grpo_reward_func/std": 0.18484662473201752,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.90625,
"learning_rate": 8.9e-07,
"loss": -0.0,
"num_tokens": 422796.0,
"reward": 0.41248780488967896,
"reward_std": 0.15024888515472412,
"rewards/grpo_reward_func/mean": 0.41248780488967896,
"rewards/grpo_reward_func/std": 0.16827252507209778,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.625,
"learning_rate": 8.866666666666667e-07,
"loss": 0.0,
"num_tokens": 435260.0,
"reward": 0.4898865818977356,
"reward_std": 0.11311106383800507,
"rewards/grpo_reward_func/mean": 0.4898865818977356,
"rewards/grpo_reward_func/std": 0.11546135693788528,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.25,
"learning_rate": 8.833333333333333e-07,
"loss": -0.0,
"num_tokens": 447656.0,
"reward": 0.402587354183197,
"reward_std": 0.07555107772350311,
"rewards/grpo_reward_func/mean": 0.402587354183197,
"rewards/grpo_reward_func/std": 0.07951883971691132,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.25,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"num_tokens": 460100.0,
"reward": 0.4937467575073242,
"reward_std": 0.11035488545894623,
"rewards/grpo_reward_func/mean": 0.4937467575073242,
"rewards/grpo_reward_func/std": 0.11266050487756729,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.875,
"learning_rate": 8.766666666666667e-07,
"loss": -0.0,
"num_tokens": 472580.0,
"reward": 0.42728495597839355,
"reward_std": 0.05418732762336731,
"rewards/grpo_reward_func/mean": 0.42728495597839355,
"rewards/grpo_reward_func/std": 0.05117730051279068,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.9375,
"learning_rate": 8.733333333333333e-07,
"loss": -0.0,
"num_tokens": 485016.0,
"reward": 0.3464398980140686,
"reward_std": 0.05486953258514404,
"rewards/grpo_reward_func/mean": 0.3464398980140686,
"rewards/grpo_reward_func/std": 0.10943454504013062,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.40625,
"learning_rate": 8.699999999999999e-07,
"loss": -0.0,
"num_tokens": 497416.0,
"reward": 0.43631184101104736,
"reward_std": 0.09718433767557144,
"rewards/grpo_reward_func/mean": 0.43631184101104736,
"rewards/grpo_reward_func/std": 0.17311933636665344,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.78125,
"learning_rate": 8.666666666666667e-07,
"loss": -0.0,
"num_tokens": 509832.0,
"reward": 0.5329959392547607,
"reward_std": 0.11580680310726166,
"rewards/grpo_reward_func/mean": 0.5329959392547607,
"rewards/grpo_reward_func/std": 0.11687568575143814,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.75,
"learning_rate": 8.633333333333333e-07,
"loss": -0.0,
"num_tokens": 522252.0,
"reward": 0.44177818298339844,
"reward_std": 0.13238248229026794,
"rewards/grpo_reward_func/mean": 0.44177818298339844,
"rewards/grpo_reward_func/std": 0.12943537533283234,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.375,
"learning_rate": 8.599999999999999e-07,
"loss": -0.0,
"num_tokens": 534660.0,
"reward": 0.5416427850723267,
"reward_std": 0.09374570846557617,
"rewards/grpo_reward_func/mean": 0.5416427850723267,
"rewards/grpo_reward_func/std": 0.11684079468250275,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.125,
"learning_rate": 8.566666666666667e-07,
"loss": 0.0,
"num_tokens": 547064.0,
"reward": 0.3880234658718109,
"reward_std": 0.06982941925525665,
"rewards/grpo_reward_func/mean": 0.3880234658718109,
"rewards/grpo_reward_func/std": 0.09098156541585922,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.34375,
"learning_rate": 8.533333333333334e-07,
"loss": -0.0,
"num_tokens": 559488.0,
"reward": 0.33481428027153015,
"reward_std": 0.06352214515209198,
"rewards/grpo_reward_func/mean": 0.33481428027153015,
"rewards/grpo_reward_func/std": 0.08472999185323715,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.0625,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0,
"num_tokens": 571944.0,
"reward": 0.387703001499176,
"reward_std": 0.07385663688182831,
"rewards/grpo_reward_func/mean": 0.387703001499176,
"rewards/grpo_reward_func/std": 0.11046246439218521,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.125,
"learning_rate": 8.466666666666667e-07,
"loss": 0.0,
"num_tokens": 584324.0,
"reward": 0.5441805124282837,
"reward_std": 0.11389695107936859,
"rewards/grpo_reward_func/mean": 0.5441805124282837,
"rewards/grpo_reward_func/std": 0.13207265734672546,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.8125,
"learning_rate": 8.433333333333333e-07,
"loss": 0.0,
"num_tokens": 596692.0,
"reward": 0.488021582365036,
"reward_std": 0.13947440683841705,
"rewards/grpo_reward_func/mean": 0.488021582365036,
"rewards/grpo_reward_func/std": 0.15811356902122498,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.421875,
"learning_rate": 8.399999999999999e-07,
"loss": -0.0,
"num_tokens": 609168.0,
"reward": 0.3698539733886719,
"reward_std": 0.04929333180189133,
"rewards/grpo_reward_func/mean": 0.3698539733886719,
"rewards/grpo_reward_func/std": 0.05231497436761856,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.59375,
"learning_rate": 8.366666666666667e-07,
"loss": -0.0,
"num_tokens": 621624.0,
"reward": 0.46477562189102173,
"reward_std": 0.07750491052865982,
"rewards/grpo_reward_func/mean": 0.46477562189102173,
"rewards/grpo_reward_func/std": 0.15642288327217102,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.0,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0,
"num_tokens": 633868.0,
"reward": 0.43864211440086365,
"reward_std": 0.13110151886940002,
"rewards/grpo_reward_func/mean": 0.43864211440086365,
"rewards/grpo_reward_func/std": 0.14933471381664276,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.625,
"learning_rate": 8.299999999999999e-07,
"loss": 0.0,
"num_tokens": 646324.0,
"reward": 0.3448641300201416,
"reward_std": 0.06778337061405182,
"rewards/grpo_reward_func/mean": 0.3448641300201416,
"rewards/grpo_reward_func/std": 0.06967282295227051,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.9375,
"learning_rate": 8.266666666666667e-07,
"loss": 0.0,
"num_tokens": 658764.0,
"reward": 0.4265494644641876,
"reward_std": 0.11092057079076767,
"rewards/grpo_reward_func/mean": 0.4265494644641876,
"rewards/grpo_reward_func/std": 0.11681105941534042,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.3125,
"learning_rate": 8.233333333333333e-07,
"loss": 0.0,
"num_tokens": 671084.0,
"reward": 0.3909933567047119,
"reward_std": 0.062042489647865295,
"rewards/grpo_reward_func/mean": 0.3909933567047119,
"rewards/grpo_reward_func/std": 0.12040998041629791,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.6875,
"learning_rate": 8.199999999999999e-07,
"loss": -0.0,
"num_tokens": 683556.0,
"reward": 0.3827119469642639,
"reward_std": 0.0810474157333374,
"rewards/grpo_reward_func/mean": 0.3827119469642639,
"rewards/grpo_reward_func/std": 0.10648734867572784,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.625,
"learning_rate": 8.166666666666666e-07,
"loss": 0.0,
"num_tokens": 696044.0,
"reward": 0.43536075949668884,
"reward_std": 0.13194429874420166,
"rewards/grpo_reward_func/mean": 0.43536075949668884,
"rewards/grpo_reward_func/std": 0.14542116224765778,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.5625,
"learning_rate": 8.133333333333333e-07,
"loss": -0.0,
"num_tokens": 708468.0,
"reward": 0.4286166727542877,
"reward_std": 0.07387880980968475,
"rewards/grpo_reward_func/mean": 0.4286166727542877,
"rewards/grpo_reward_func/std": 0.10452007502317429,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.5625,
"learning_rate": 8.1e-07,
"loss": -0.0,
"num_tokens": 720940.0,
"reward": 0.38893401622772217,
"reward_std": 0.0943751409649849,
"rewards/grpo_reward_func/mean": 0.38893401622772217,
"rewards/grpo_reward_func/std": 0.12028432637453079,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.1875,
"learning_rate": 8.066666666666666e-07,
"loss": -0.0,
"num_tokens": 733360.0,
"reward": 0.4644596576690674,
"reward_std": 0.16205663979053497,
"rewards/grpo_reward_func/mean": 0.4644596576690674,
"rewards/grpo_reward_func/std": 0.15505553781986237,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.1875,
"learning_rate": 8.033333333333333e-07,
"loss": 0.0,
"num_tokens": 745704.0,
"reward": 0.46369504928588867,
"reward_std": 0.0912257730960846,
"rewards/grpo_reward_func/mean": 0.46369504928588867,
"rewards/grpo_reward_func/std": 0.09050611406564713,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.84375,
"learning_rate": 8e-07,
"loss": -0.0,
"num_tokens": 758080.0,
"reward": 0.4551791548728943,
"reward_std": 0.12297463417053223,
"rewards/grpo_reward_func/mean": 0.4551791548728943,
"rewards/grpo_reward_func/std": 0.14138628542423248,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.0,
"learning_rate": 7.966666666666666e-07,
"loss": 0.0,
"num_tokens": 770712.0,
"reward": 0.39730104804039,
"reward_std": 0.06629657000303268,
"rewards/grpo_reward_func/mean": 0.39730104804039,
"rewards/grpo_reward_func/std": 0.08781840652227402,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 1.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.625,
"learning_rate": 7.933333333333333e-07,
"loss": 0.0,
"num_tokens": 783120.0,
"reward": 0.40575429797172546,
"reward_std": 0.09323962777853012,
"rewards/grpo_reward_func/mean": 0.40575429797172546,
"rewards/grpo_reward_func/std": 0.1281837671995163,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.0,
"learning_rate": 7.9e-07,
"loss": 0.0,
"num_tokens": 795568.0,
"reward": 0.4539070129394531,
"reward_std": 0.1893976330757141,
"rewards/grpo_reward_func/mean": 0.4539070129394531,
"rewards/grpo_reward_func/std": 0.17878401279449463,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.625,
"learning_rate": 7.866666666666666e-07,
"loss": 0.0,
"num_tokens": 808072.0,
"reward": 0.42031583189964294,
"reward_std": 0.06885866075754166,
"rewards/grpo_reward_func/mean": 0.42031583189964294,
"rewards/grpo_reward_func/std": 0.06720545887947083,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.9375,
"learning_rate": 7.833333333333333e-07,
"loss": -0.0,
"num_tokens": 820592.0,
"reward": 0.446481317281723,
"reward_std": 0.06617365032434464,
"rewards/grpo_reward_func/mean": 0.446481317281723,
"rewards/grpo_reward_func/std": 0.11224810034036636,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.0625,
"learning_rate": 7.799999999999999e-07,
"loss": -0.0,
"num_tokens": 833008.0,
"reward": 0.29850703477859497,
"reward_std": 0.07878842949867249,
"rewards/grpo_reward_func/mean": 0.29850703477859497,
"rewards/grpo_reward_func/std": 0.09381019324064255,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.8125,
"learning_rate": 7.766666666666666e-07,
"loss": 0.0,
"num_tokens": 845476.0,
"reward": 0.3700866401195526,
"reward_std": 0.11176452040672302,
"rewards/grpo_reward_func/mean": 0.3700866401195526,
"rewards/grpo_reward_func/std": 0.1271413266658783,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.75,
"learning_rate": 7.733333333333333e-07,
"loss": -0.0,
"num_tokens": 857896.0,
"reward": 0.4782499074935913,
"reward_std": 0.10448910295963287,
"rewards/grpo_reward_func/mean": 0.4782499074935913,
"rewards/grpo_reward_func/std": 0.125322625041008,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.5,
"learning_rate": 7.699999999999999e-07,
"loss": -0.0,
"num_tokens": 870308.0,
"reward": 0.44694995880126953,
"reward_std": 0.11892125755548477,
"rewards/grpo_reward_func/mean": 0.44694995880126953,
"rewards/grpo_reward_func/std": 0.15172399580478668,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.25,
"learning_rate": 7.666666666666667e-07,
"loss": -0.0,
"num_tokens": 882696.0,
"reward": 0.48773661255836487,
"reward_std": 0.18720099329948425,
"rewards/grpo_reward_func/mean": 0.48773661255836487,
"rewards/grpo_reward_func/std": 0.19652612507343292,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.21875,
"learning_rate": 7.633333333333333e-07,
"loss": 0.0,
"num_tokens": 895208.0,
"reward": 0.360309362411499,
"reward_std": 0.05594930052757263,
"rewards/grpo_reward_func/mean": 0.360309362411499,
"rewards/grpo_reward_func/std": 0.08431853353977203,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.3125,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0,
"num_tokens": 907548.0,
"reward": 0.4548572897911072,
"reward_std": 0.1430705040693283,
"rewards/grpo_reward_func/mean": 0.4548572897911072,
"rewards/grpo_reward_func/std": 0.144826740026474,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.40625,
"learning_rate": 7.566666666666667e-07,
"loss": -0.0,
"num_tokens": 919976.0,
"reward": 0.43647801876068115,
"reward_std": 0.10883159935474396,
"rewards/grpo_reward_func/mean": 0.43647801876068115,
"rewards/grpo_reward_func/std": 0.13386793434619904,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.96875,
"learning_rate": 7.533333333333332e-07,
"loss": 0.0,
"num_tokens": 932436.0,
"reward": 0.3631000518798828,
"reward_std": 0.055175162851810455,
"rewards/grpo_reward_func/mean": 0.3631000518798828,
"rewards/grpo_reward_func/std": 0.061299730092287064,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.625,
"learning_rate": 7.5e-07,
"loss": -0.0,
"num_tokens": 944744.0,
"reward": 0.3734683394432068,
"reward_std": 0.07731673121452332,
"rewards/grpo_reward_func/mean": 0.3734683394432068,
"rewards/grpo_reward_func/std": 0.1018432006239891,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.75,
"learning_rate": 7.466666666666667e-07,
"loss": -0.0,
"num_tokens": 957140.0,
"reward": 0.3586929738521576,
"reward_std": 0.08576677739620209,
"rewards/grpo_reward_func/mean": 0.3586929738521576,
"rewards/grpo_reward_func/std": 0.09627655893564224,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.1875,
"learning_rate": 7.433333333333332e-07,
"loss": -0.0,
"num_tokens": 969588.0,
"reward": 0.3304125964641571,
"reward_std": 0.09432289004325867,
"rewards/grpo_reward_func/mean": 0.3304125964641571,
"rewards/grpo_reward_func/std": 0.12439437210559845,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.625,
"learning_rate": 7.4e-07,
"loss": -0.0,
"num_tokens": 982032.0,
"reward": 0.4600115418434143,
"reward_std": 0.11891645193099976,
"rewards/grpo_reward_func/mean": 0.4600115418434143,
"rewards/grpo_reward_func/std": 0.11769349873065948,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.5625,
"learning_rate": 7.366666666666667e-07,
"loss": 0.0,
"num_tokens": 994440.0,
"reward": 0.4921344816684723,
"reward_std": 0.18801572918891907,
"rewards/grpo_reward_func/mean": 0.4921344816684723,
"rewards/grpo_reward_func/std": 0.17593181133270264,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.03125,
"learning_rate": 7.333333333333332e-07,
"loss": -0.0,
"num_tokens": 1006908.0,
"reward": 0.44369810819625854,
"reward_std": 0.11731548607349396,
"rewards/grpo_reward_func/mean": 0.44369810819625854,
"rewards/grpo_reward_func/std": 0.13351494073867798,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.40625,
"learning_rate": 7.3e-07,
"loss": -0.0,
"num_tokens": 1019360.0,
"reward": 0.4988051652908325,
"reward_std": 0.08421847224235535,
"rewards/grpo_reward_func/mean": 0.4988051652908325,
"rewards/grpo_reward_func/std": 0.12857672572135925,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.1875,
"learning_rate": 7.266666666666667e-07,
"loss": 0.0,
"num_tokens": 1031756.0,
"reward": 0.4094837009906769,
"reward_std": 0.10778755694627762,
"rewards/grpo_reward_func/mean": 0.4094837009906769,
"rewards/grpo_reward_func/std": 0.11033328622579575,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.375,
"learning_rate": 7.233333333333333e-07,
"loss": -0.0,
"num_tokens": 1044160.0,
"reward": 0.3499518632888794,
"reward_std": 0.07542143762111664,
"rewards/grpo_reward_func/mean": 0.3499518632888794,
"rewards/grpo_reward_func/std": 0.08578986674547195,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.875,
"learning_rate": 7.2e-07,
"loss": -0.0,
"num_tokens": 1056580.0,
"reward": 0.4997272491455078,
"reward_std": 0.1262975037097931,
"rewards/grpo_reward_func/mean": 0.4997272491455078,
"rewards/grpo_reward_func/std": 0.1279306709766388,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.0625,
"learning_rate": 7.166666666666667e-07,
"loss": 0.0,
"num_tokens": 1069020.0,
"reward": 0.46792131662368774,
"reward_std": 0.13234254717826843,
"rewards/grpo_reward_func/mean": 0.46792131662368774,
"rewards/grpo_reward_func/std": 0.1700320839881897,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.375,
"learning_rate": 7.133333333333333e-07,
"loss": -0.0,
"num_tokens": 1081496.0,
"reward": 0.4166927635669708,
"reward_std": 0.07564548403024673,
"rewards/grpo_reward_func/mean": 0.4166927635669708,
"rewards/grpo_reward_func/std": 0.19586633145809174,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.5,
"learning_rate": 7.1e-07,
"loss": 0.0,
"num_tokens": 1093816.0,
"reward": 0.42825716733932495,
"reward_std": 0.1293352246284485,
"rewards/grpo_reward_func/mean": 0.42825716733932495,
"rewards/grpo_reward_func/std": 0.1340746283531189,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.96875,
"learning_rate": 7.066666666666666e-07,
"loss": 0.0,
"num_tokens": 1106336.0,
"reward": 0.40863943099975586,
"reward_std": 0.061242297291755676,
"rewards/grpo_reward_func/mean": 0.40863943099975586,
"rewards/grpo_reward_func/std": 0.11059094965457916,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.875,
"learning_rate": 7.033333333333333e-07,
"loss": 0.0,
"num_tokens": 1118756.0,
"reward": 0.44183290004730225,
"reward_std": 0.1359260380268097,
"rewards/grpo_reward_func/mean": 0.44183290004730225,
"rewards/grpo_reward_func/std": 0.15313053131103516,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.9375,
"learning_rate": 7e-07,
"loss": -0.0,
"num_tokens": 1131108.0,
"reward": 0.4604765474796295,
"reward_std": 0.09057141840457916,
"rewards/grpo_reward_func/mean": 0.4604765474796295,
"rewards/grpo_reward_func/std": 0.17239472270011902,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.84375,
"learning_rate": 6.966666666666666e-07,
"loss": -0.0,
"num_tokens": 1143520.0,
"reward": 0.37243229150772095,
"reward_std": 0.07444402575492859,
"rewards/grpo_reward_func/mean": 0.37243229150772095,
"rewards/grpo_reward_func/std": 0.1061118021607399,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.25,
"learning_rate": 6.933333333333333e-07,
"loss": -0.0,
"num_tokens": 1156008.0,
"reward": 0.5441325902938843,
"reward_std": 0.11369525641202927,
"rewards/grpo_reward_func/mean": 0.5441325902938843,
"rewards/grpo_reward_func/std": 0.11172118782997131,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.3125,
"learning_rate": 6.9e-07,
"loss": -0.0,
"num_tokens": 1168452.0,
"reward": 0.4581533670425415,
"reward_std": 0.11172451823949814,
"rewards/grpo_reward_func/mean": 0.4581533670425415,
"rewards/grpo_reward_func/std": 0.1257813274860382,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 2.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.6875,
"learning_rate": 6.866666666666666e-07,
"loss": -0.0,
"num_tokens": 1180928.0,
"reward": 0.4434836208820343,
"reward_std": 0.14923422038555145,
"rewards/grpo_reward_func/mean": 0.4434836208820343,
"rewards/grpo_reward_func/std": 0.1542947143316269,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.5625,
"learning_rate": 6.833333333333333e-07,
"loss": -0.0,
"num_tokens": 1193352.0,
"reward": 0.3983464241027832,
"reward_std": 0.08742759376764297,
"rewards/grpo_reward_func/mean": 0.3983464241027832,
"rewards/grpo_reward_func/std": 0.12986424565315247,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.875,
"learning_rate": 6.800000000000001e-07,
"loss": -0.0,
"num_tokens": 1205796.0,
"reward": 0.5015304088592529,
"reward_std": 0.08956287801265717,
"rewards/grpo_reward_func/mean": 0.5015304088592529,
"rewards/grpo_reward_func/std": 0.08333175629377365,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.875,
"learning_rate": 6.766666666666666e-07,
"loss": 0.0,
"num_tokens": 1218244.0,
"reward": 0.47066164016723633,
"reward_std": 0.19255688786506653,
"rewards/grpo_reward_func/mean": 0.47066164016723633,
"rewards/grpo_reward_func/std": 0.1828991174697876,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.5625,
"learning_rate": 6.733333333333333e-07,
"loss": 0.0,
"num_tokens": 1230632.0,
"reward": 0.49644234776496887,
"reward_std": 0.10233695805072784,
"rewards/grpo_reward_func/mean": 0.49644234776496887,
"rewards/grpo_reward_func/std": 0.09938962757587433,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.0,
"learning_rate": 6.7e-07,
"loss": -0.0,
"num_tokens": 1243024.0,
"reward": 0.48214682936668396,
"reward_std": 0.1728937327861786,
"rewards/grpo_reward_func/mean": 0.48214682936668396,
"rewards/grpo_reward_func/std": 0.16634704172611237,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.1875,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0,
"num_tokens": 1255484.0,
"reward": 0.4351156949996948,
"reward_std": 0.12486094236373901,
"rewards/grpo_reward_func/mean": 0.4351156949996948,
"rewards/grpo_reward_func/std": 0.1314164698123932,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.4375,
"learning_rate": 6.633333333333333e-07,
"loss": 0.0,
"num_tokens": 1267856.0,
"reward": 0.38795578479766846,
"reward_std": 0.1968497335910797,
"rewards/grpo_reward_func/mean": 0.38795578479766846,
"rewards/grpo_reward_func/std": 0.18232691287994385,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.0625,
"learning_rate": 6.6e-07,
"loss": 0.0,
"num_tokens": 1280280.0,
"reward": 0.3891702890396118,
"reward_std": 0.09787960350513458,
"rewards/grpo_reward_func/mean": 0.3891702890396118,
"rewards/grpo_reward_func/std": 0.09284209460020065,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06298828125,
"learning_rate": 6.566666666666666e-07,
"loss": 0.0,
"num_tokens": 1292752.0,
"reward": 0.39056217670440674,
"reward_std": 0.04999999329447746,
"rewards/grpo_reward_func/mean": 0.39056217670440674,
"rewards/grpo_reward_func/std": 0.04636901617050171,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.75,
"learning_rate": 6.533333333333333e-07,
"loss": -0.0,
"num_tokens": 1305200.0,
"reward": 0.4605242609977722,
"reward_std": 0.13093939423561096,
"rewards/grpo_reward_func/mean": 0.4605242609977722,
"rewards/grpo_reward_func/std": 0.15952207148075104,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.6875,
"learning_rate": 6.5e-07,
"loss": -0.0,
"num_tokens": 1317660.0,
"reward": 0.3946014940738678,
"reward_std": 0.09192033857107162,
"rewards/grpo_reward_func/mean": 0.3946014940738678,
"rewards/grpo_reward_func/std": 0.10782631486654282,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.6875,
"learning_rate": 6.466666666666666e-07,
"loss": 0.0,
"num_tokens": 1330068.0,
"reward": 0.4714941084384918,
"reward_std": 0.09265273809432983,
"rewards/grpo_reward_func/mean": 0.4714941084384918,
"rewards/grpo_reward_func/std": 0.12330163270235062,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.5625,
"learning_rate": 6.433333333333332e-07,
"loss": 0.0,
"num_tokens": 1342620.0,
"reward": 0.49374109506607056,
"reward_std": 0.0895591527223587,
"rewards/grpo_reward_func/mean": 0.49374109506607056,
"rewards/grpo_reward_func/std": 0.1332620531320572,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.625,
"learning_rate": 6.4e-07,
"loss": 0.0,
"num_tokens": 1355016.0,
"reward": 0.3305853009223938,
"reward_std": 0.04621565341949463,
"rewards/grpo_reward_func/mean": 0.3305853009223938,
"rewards/grpo_reward_func/std": 0.04419610649347305,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.6875,
"learning_rate": 6.366666666666667e-07,
"loss": -0.0,
"num_tokens": 1367452.0,
"reward": 0.5173900723457336,
"reward_std": 0.14908233284950256,
"rewards/grpo_reward_func/mean": 0.5173900723457336,
"rewards/grpo_reward_func/std": 0.15880633890628815,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.9375,
"learning_rate": 6.333333333333332e-07,
"loss": -0.0,
"num_tokens": 1379760.0,
"reward": 0.3797125816345215,
"reward_std": 0.10961093008518219,
"rewards/grpo_reward_func/mean": 0.3797125816345215,
"rewards/grpo_reward_func/std": 0.12369874864816666,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.875,
"learning_rate": 6.3e-07,
"loss": 0.0,
"num_tokens": 1392296.0,
"reward": 0.3952435255050659,
"reward_std": 0.07089774310588837,
"rewards/grpo_reward_func/mean": 0.3952435255050659,
"rewards/grpo_reward_func/std": 0.09734237939119339,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.25,
"learning_rate": 6.266666666666667e-07,
"loss": 0.0,
"num_tokens": 1404748.0,
"reward": 0.4383198916912079,
"reward_std": 0.08845233917236328,
"rewards/grpo_reward_func/mean": 0.4383198916912079,
"rewards/grpo_reward_func/std": 0.08347002416849136,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.125,
"learning_rate": 6.233333333333332e-07,
"loss": -0.0,
"num_tokens": 1417172.0,
"reward": 0.3984643220901489,
"reward_std": 0.08412055671215057,
"rewards/grpo_reward_func/mean": 0.3984643220901489,
"rewards/grpo_reward_func/std": 0.08139137923717499,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.0,
"learning_rate": 6.2e-07,
"loss": -0.0,
"num_tokens": 1429572.0,
"reward": 0.3756071925163269,
"reward_std": 0.1621457189321518,
"rewards/grpo_reward_func/mean": 0.3756071925163269,
"rewards/grpo_reward_func/std": 0.16212420165538788,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.0625,
"learning_rate": 6.166666666666667e-07,
"loss": -0.0,
"num_tokens": 1441984.0,
"reward": 0.3367416262626648,
"reward_std": 0.10579686611890793,
"rewards/grpo_reward_func/mean": 0.3367416262626648,
"rewards/grpo_reward_func/std": 0.12276742607355118,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.09375,
"learning_rate": 6.133333333333332e-07,
"loss": -0.0,
"num_tokens": 1454520.0,
"reward": 0.33171868324279785,
"reward_std": 0.05540106073021889,
"rewards/grpo_reward_func/mean": 0.33171868324279785,
"rewards/grpo_reward_func/std": 0.05543047562241554,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.4375,
"learning_rate": 6.1e-07,
"loss": -0.0,
"num_tokens": 1466968.0,
"reward": 0.46069973707199097,
"reward_std": 0.08953073620796204,
"rewards/grpo_reward_func/mean": 0.46069973707199097,
"rewards/grpo_reward_func/std": 0.10067260265350342,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.5,
"learning_rate": 6.066666666666666e-07,
"loss": 0.0,
"num_tokens": 1479328.0,
"reward": 0.49788278341293335,
"reward_std": 0.12688566744327545,
"rewards/grpo_reward_func/mean": 0.49788278341293335,
"rewards/grpo_reward_func/std": 0.12214919179677963,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.875,
"learning_rate": 6.033333333333333e-07,
"loss": 0.0,
"num_tokens": 1491788.0,
"reward": 0.35892003774642944,
"reward_std": 0.0625436007976532,
"rewards/grpo_reward_func/mean": 0.35892003774642944,
"rewards/grpo_reward_func/std": 0.09081238508224487,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.0625,
"learning_rate": 6e-07,
"loss": 0.0,
"num_tokens": 1504220.0,
"reward": 0.38591668009757996,
"reward_std": 0.15822480618953705,
"rewards/grpo_reward_func/mean": 0.38591668009757996,
"rewards/grpo_reward_func/std": 0.16854539513587952,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.25,
"learning_rate": 5.966666666666666e-07,
"loss": -0.0,
"num_tokens": 1516652.0,
"reward": 0.43537092208862305,
"reward_std": 0.14132292568683624,
"rewards/grpo_reward_func/mean": 0.43537092208862305,
"rewards/grpo_reward_func/std": 0.15050342679023743,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.125,
"learning_rate": 5.933333333333334e-07,
"loss": -0.0,
"num_tokens": 1529072.0,
"reward": 0.4112863540649414,
"reward_std": 0.08730175346136093,
"rewards/grpo_reward_func/mean": 0.4112863540649414,
"rewards/grpo_reward_func/std": 0.09073270857334137,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.0625,
"learning_rate": 5.9e-07,
"loss": -0.0,
"num_tokens": 1541488.0,
"reward": 0.3833653926849365,
"reward_std": 0.09057098627090454,
"rewards/grpo_reward_func/mean": 0.3833653926849365,
"rewards/grpo_reward_func/std": 0.08530126512050629,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.875,
"learning_rate": 5.866666666666666e-07,
"loss": 0.0,
"num_tokens": 1553812.0,
"reward": 0.5172641277313232,
"reward_std": 0.08300620317459106,
"rewards/grpo_reward_func/mean": 0.5172641277313232,
"rewards/grpo_reward_func/std": 0.18922077119350433,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.9375,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0,
"num_tokens": 1566244.0,
"reward": 0.45866021513938904,
"reward_std": 0.13558343052864075,
"rewards/grpo_reward_func/mean": 0.45866021513938904,
"rewards/grpo_reward_func/std": 0.12821511924266815,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 3.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.46875,
"learning_rate": 5.8e-07,
"loss": -0.0,
"num_tokens": 1578680.0,
"reward": 0.4404694437980652,
"reward_std": 0.058066606521606445,
"rewards/grpo_reward_func/mean": 0.4404694437980652,
"rewards/grpo_reward_func/std": 0.057657789438962936,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.9375,
"learning_rate": 5.766666666666666e-07,
"loss": -0.0,
"num_tokens": 1591136.0,
"reward": 0.3580424189567566,
"reward_std": 0.07987552881240845,
"rewards/grpo_reward_func/mean": 0.3580424189567566,
"rewards/grpo_reward_func/std": 0.0977816954255104,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.875,
"learning_rate": 5.733333333333334e-07,
"loss": 0.0,
"num_tokens": 1603604.0,
"reward": 0.3891274929046631,
"reward_std": 0.15381482243537903,
"rewards/grpo_reward_func/mean": 0.3891274929046631,
"rewards/grpo_reward_func/std": 0.17152857780456543,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.25,
"learning_rate": 5.699999999999999e-07,
"loss": 0.0,
"num_tokens": 1616044.0,
"reward": 0.27857083082199097,
"reward_std": 0.09501777589321136,
"rewards/grpo_reward_func/mean": 0.27857083082199097,
"rewards/grpo_reward_func/std": 0.1052025854587555,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.1875,
"learning_rate": 5.666666666666666e-07,
"loss": -0.0,
"num_tokens": 1628436.0,
"reward": 0.35340362787246704,
"reward_std": 0.16999280452728271,
"rewards/grpo_reward_func/mean": 0.35340362787246704,
"rewards/grpo_reward_func/std": 0.16278210282325745,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.3125,
"learning_rate": 5.633333333333334e-07,
"loss": -0.0,
"num_tokens": 1640824.0,
"reward": 0.4199197590351105,
"reward_std": 0.08985067158937454,
"rewards/grpo_reward_func/mean": 0.4199197590351105,
"rewards/grpo_reward_func/std": 0.09818078577518463,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.0,
"learning_rate": 5.6e-07,
"loss": -0.0,
"num_tokens": 1653220.0,
"reward": 0.44602805376052856,
"reward_std": 0.10932175815105438,
"rewards/grpo_reward_func/mean": 0.44602805376052856,
"rewards/grpo_reward_func/std": 0.11537235230207443,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.5625,
"learning_rate": 5.566666666666666e-07,
"loss": -0.0,
"num_tokens": 1665684.0,
"reward": 0.4218568205833435,
"reward_std": 0.09915173053741455,
"rewards/grpo_reward_func/mean": 0.4218568205833435,
"rewards/grpo_reward_func/std": 0.1479072868824005,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.375,
"learning_rate": 5.533333333333334e-07,
"loss": -0.0,
"num_tokens": 1678120.0,
"reward": 0.3699283301830292,
"reward_std": 0.05628474801778793,
"rewards/grpo_reward_func/mean": 0.3699283301830292,
"rewards/grpo_reward_func/std": 0.055360160768032074,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.625,
"learning_rate": 5.5e-07,
"loss": 0.0,
"num_tokens": 1690616.0,
"reward": 0.43144893646240234,
"reward_std": 0.097145214676857,
"rewards/grpo_reward_func/mean": 0.43144893646240234,
"rewards/grpo_reward_func/std": 0.09757841378450394,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.875,
"learning_rate": 5.466666666666666e-07,
"loss": -0.0,
"num_tokens": 1703048.0,
"reward": 0.37039631605148315,
"reward_std": 0.06340405344963074,
"rewards/grpo_reward_func/mean": 0.37039631605148315,
"rewards/grpo_reward_func/std": 0.10630898922681808,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.5,
"learning_rate": 5.433333333333334e-07,
"loss": -0.0,
"num_tokens": 1715404.0,
"reward": 0.44485020637512207,
"reward_std": 0.061223354190588,
"rewards/grpo_reward_func/mean": 0.44485020637512207,
"rewards/grpo_reward_func/std": 0.0653579831123352,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 18.75,
"learning_rate": 5.4e-07,
"loss": -0.0,
"num_tokens": 1727876.0,
"reward": 0.4389991760253906,
"reward_std": 0.12622228264808655,
"rewards/grpo_reward_func/mean": 0.4389991760253906,
"rewards/grpo_reward_func/std": 0.1206517443060875,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.3125,
"learning_rate": 5.366666666666666e-07,
"loss": 0.0,
"num_tokens": 1740252.0,
"reward": 0.3506331741809845,
"reward_std": 0.1391739398241043,
"rewards/grpo_reward_func/mean": 0.3506331741809845,
"rewards/grpo_reward_func/std": 0.14306746423244476,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.8125,
"learning_rate": 5.333333333333333e-07,
"loss": -0.0,
"num_tokens": 1752632.0,
"reward": 0.5316411256790161,
"reward_std": 0.10773089528083801,
"rewards/grpo_reward_func/mean": 0.5316411256790161,
"rewards/grpo_reward_func/std": 0.16645555198192596,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.40625,
"learning_rate": 5.3e-07,
"loss": -0.0,
"num_tokens": 1765040.0,
"reward": 0.3930637836456299,
"reward_std": 0.07452228665351868,
"rewards/grpo_reward_func/mean": 0.3930637836456299,
"rewards/grpo_reward_func/std": 0.07487671822309494,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 11.875,
"completions/mean_terminated_length": 11.875,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 4.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.25,
"learning_rate": 5.266666666666666e-07,
"loss": -0.0142,
"num_tokens": 1777423.0,
"reward": 0.3444192409515381,
"reward_std": 0.1598653644323349,
"rewards/grpo_reward_func/mean": 0.3444192409515381,
"rewards/grpo_reward_func/std": 0.18078266084194183,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.0,
"learning_rate": 5.233333333333333e-07,
"loss": 0.0,
"num_tokens": 1789683.0,
"reward": 0.5174664258956909,
"reward_std": 0.07813962548971176,
"rewards/grpo_reward_func/mean": 0.5174664258956909,
"rewards/grpo_reward_func/std": 0.10316640138626099,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.625,
"learning_rate": 5.2e-07,
"loss": 0.0,
"num_tokens": 1802119.0,
"reward": 0.3699246048927307,
"reward_std": 0.08162573724985123,
"rewards/grpo_reward_func/mean": 0.3699246048927307,
"rewards/grpo_reward_func/std": 0.09686075896024704,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.0,
"learning_rate": 5.166666666666667e-07,
"loss": 0.0,
"num_tokens": 1814483.0,
"reward": 0.4604162275791168,
"reward_std": 0.19567811489105225,
"rewards/grpo_reward_func/mean": 0.4604162275791168,
"rewards/grpo_reward_func/std": 0.19948698580265045,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.90625,
"learning_rate": 5.133333333333333e-07,
"loss": -0.0,
"num_tokens": 1827055.0,
"reward": 0.41122761368751526,
"reward_std": 0.08153079450130463,
"rewards/grpo_reward_func/mean": 0.41122761368751526,
"rewards/grpo_reward_func/std": 0.08045266568660736,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8125,
"learning_rate": 5.1e-07,
"loss": 0.0,
"num_tokens": 1839535.0,
"reward": 0.3292653560638428,
"reward_std": 0.04870126396417618,
"rewards/grpo_reward_func/mean": 0.3292653560638428,
"rewards/grpo_reward_func/std": 0.07768747955560684,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.5,
"learning_rate": 5.066666666666667e-07,
"loss": -0.0,
"num_tokens": 1852003.0,
"reward": 0.4356845021247864,
"reward_std": 0.11020061373710632,
"rewards/grpo_reward_func/mean": 0.4356845021247864,
"rewards/grpo_reward_func/std": 0.12760911881923676,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.1875,
"learning_rate": 5.033333333333333e-07,
"loss": 0.0,
"num_tokens": 1864487.0,
"reward": 0.47176241874694824,
"reward_std": 0.1466352343559265,
"rewards/grpo_reward_func/mean": 0.47176241874694824,
"rewards/grpo_reward_func/std": 0.15562468767166138,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.5,
"learning_rate": 5e-07,
"loss": 0.0,
"num_tokens": 1876939.0,
"reward": 0.49293607473373413,
"reward_std": 0.15847747027873993,
"rewards/grpo_reward_func/mean": 0.49293607473373413,
"rewards/grpo_reward_func/std": 0.16349899768829346,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.1875,
"learning_rate": 4.966666666666666e-07,
"loss": -0.0,
"num_tokens": 1889499.0,
"reward": 0.4915664792060852,
"reward_std": 0.19223570823669434,
"rewards/grpo_reward_func/mean": 0.4915664792060852,
"rewards/grpo_reward_func/std": 0.1780252456665039,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.1875,
"learning_rate": 4.933333333333333e-07,
"loss": -0.0,
"num_tokens": 1901911.0,
"reward": 0.39836806058883667,
"reward_std": 0.08220314979553223,
"rewards/grpo_reward_func/mean": 0.39836806058883667,
"rewards/grpo_reward_func/std": 0.09293971210718155,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.0625,
"learning_rate": 4.9e-07,
"loss": 0.0,
"num_tokens": 1914267.0,
"reward": 0.5052293539047241,
"reward_std": 0.05901884660124779,
"rewards/grpo_reward_func/mean": 0.5052293539047241,
"rewards/grpo_reward_func/std": 0.07250750809907913,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.8125,
"learning_rate": 4.866666666666666e-07,
"loss": -0.0,
"num_tokens": 1926679.0,
"reward": 0.2826748192310333,
"reward_std": 0.0776633769273758,
"rewards/grpo_reward_func/mean": 0.2826748192310333,
"rewards/grpo_reward_func/std": 0.07334372401237488,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.125,
"learning_rate": 4.833333333333333e-07,
"loss": -0.0,
"num_tokens": 1939135.0,
"reward": 0.38298332691192627,
"reward_std": 0.15204550325870514,
"rewards/grpo_reward_func/mean": 0.38298332691192627,
"rewards/grpo_reward_func/std": 0.17793436348438263,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.625,
"learning_rate": 4.8e-07,
"loss": 0.0,
"num_tokens": 1951555.0,
"reward": 0.45585888624191284,
"reward_std": 0.08215408027172089,
"rewards/grpo_reward_func/mean": 0.45585888624191284,
"rewards/grpo_reward_func/std": 0.08240208774805069,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.8125,
"learning_rate": 4.7666666666666667e-07,
"loss": 0.0,
"num_tokens": 1964007.0,
"reward": 0.4660055935382843,
"reward_std": 0.17032964527606964,
"rewards/grpo_reward_func/mean": 0.4660055935382843,
"rewards/grpo_reward_func/std": 0.1751418560743332,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 4.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.5,
"learning_rate": 4.733333333333333e-07,
"loss": 0.0,
"num_tokens": 1976483.0,
"reward": 0.5173270106315613,
"reward_std": 0.17288881540298462,
"rewards/grpo_reward_func/mean": 0.5173270106315613,
"rewards/grpo_reward_func/std": 0.1660362035036087,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.125,
"learning_rate": 4.6999999999999995e-07,
"loss": 0.0,
"num_tokens": 1988919.0,
"reward": 0.4136430323123932,
"reward_std": 0.22560492157936096,
"rewards/grpo_reward_func/mean": 0.4136430323123932,
"rewards/grpo_reward_func/std": 0.21545714139938354,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.1875,
"learning_rate": 4.6666666666666666e-07,
"loss": -0.0,
"num_tokens": 2001379.0,
"reward": 0.43014535307884216,
"reward_std": 0.08317069709300995,
"rewards/grpo_reward_func/mean": 0.43014535307884216,
"rewards/grpo_reward_func/std": 0.07750457525253296,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.90625,
"learning_rate": 4.633333333333333e-07,
"loss": -0.0,
"num_tokens": 2013971.0,
"reward": 0.34895196557044983,
"reward_std": 0.0512375608086586,
"rewards/grpo_reward_func/mean": 0.34895196557044983,
"rewards/grpo_reward_func/std": 0.04775034263730049,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.34375,
"learning_rate": 4.6e-07,
"loss": 0.0,
"num_tokens": 2026375.0,
"reward": 0.3551255464553833,
"reward_std": 0.12043958902359009,
"rewards/grpo_reward_func/mean": 0.3551255464553833,
"rewards/grpo_reward_func/std": 0.13196633756160736,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.1875,
"learning_rate": 4.5666666666666665e-07,
"loss": 0.0,
"num_tokens": 2038823.0,
"reward": 0.41090184450149536,
"reward_std": 0.11341163516044617,
"rewards/grpo_reward_func/mean": 0.41090184450149536,
"rewards/grpo_reward_func/std": 0.11507044732570648,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.5625,
"learning_rate": 4.5333333333333326e-07,
"loss": -0.0,
"num_tokens": 2051219.0,
"reward": 0.3054584860801697,
"reward_std": 0.08504727482795715,
"rewards/grpo_reward_func/mean": 0.3054584860801697,
"rewards/grpo_reward_func/std": 0.09004215151071548,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.9375,
"learning_rate": 4.5e-07,
"loss": -0.0,
"num_tokens": 2063651.0,
"reward": 0.47015416622161865,
"reward_std": 0.15467038750648499,
"rewards/grpo_reward_func/mean": 0.47015416622161865,
"rewards/grpo_reward_func/std": 0.153534397482872,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.25,
"learning_rate": 4.4666666666666664e-07,
"loss": -0.0,
"num_tokens": 2075955.0,
"reward": 0.4894865155220032,
"reward_std": 0.07816055417060852,
"rewards/grpo_reward_func/mean": 0.4894865155220032,
"rewards/grpo_reward_func/std": 0.07534909248352051,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.8125,
"learning_rate": 4.4333333333333336e-07,
"loss": -0.0,
"num_tokens": 2088435.0,
"reward": 0.4603702425956726,
"reward_std": 0.15144219994544983,
"rewards/grpo_reward_func/mean": 0.4603702425956726,
"rewards/grpo_reward_func/std": 0.16273239254951477,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.75,
"learning_rate": 4.3999999999999997e-07,
"loss": -0.0,
"num_tokens": 2100919.0,
"reward": 0.3637647032737732,
"reward_std": 0.06757047772407532,
"rewards/grpo_reward_func/mean": 0.3637647032737732,
"rewards/grpo_reward_func/std": 0.08233585953712463,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.8125,
"learning_rate": 4.3666666666666663e-07,
"loss": 0.0,
"num_tokens": 2113343.0,
"reward": 0.3543202579021454,
"reward_std": 0.08441969752311707,
"rewards/grpo_reward_func/mean": 0.3543202579021454,
"rewards/grpo_reward_func/std": 0.08902662247419357,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.4375,
"learning_rate": 4.3333333333333335e-07,
"loss": 0.0,
"num_tokens": 2125791.0,
"reward": 0.47929999232292175,
"reward_std": 0.17670738697052002,
"rewards/grpo_reward_func/mean": 0.47929999232292175,
"rewards/grpo_reward_func/std": 0.17999567091464996,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.5625,
"learning_rate": 4.2999999999999996e-07,
"loss": -0.0,
"num_tokens": 2138251.0,
"reward": 0.3452494740486145,
"reward_std": 0.08022183179855347,
"rewards/grpo_reward_func/mean": 0.3452494740486145,
"rewards/grpo_reward_func/std": 0.08067353814840317,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.3125,
"learning_rate": 4.266666666666667e-07,
"loss": -0.0,
"num_tokens": 2150659.0,
"reward": 0.42406925559043884,
"reward_std": 0.2445584237575531,
"rewards/grpo_reward_func/mean": 0.42406925559043884,
"rewards/grpo_reward_func/std": 0.22746475040912628,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.34375,
"learning_rate": 4.2333333333333334e-07,
"loss": -0.0,
"num_tokens": 2163163.0,
"reward": 0.4360213279724121,
"reward_std": 0.07188587635755539,
"rewards/grpo_reward_func/mean": 0.4360213279724121,
"rewards/grpo_reward_func/std": 0.07112448662519455,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.65625,
"learning_rate": 4.1999999999999995e-07,
"loss": -0.0,
"num_tokens": 2175679.0,
"reward": 0.40680232644081116,
"reward_std": 0.054570674896240234,
"rewards/grpo_reward_func/mean": 0.40680232644081116,
"rewards/grpo_reward_func/std": 0.05052686110138893,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.1875,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0,
"num_tokens": 2188199.0,
"reward": 0.4191306233406067,
"reward_std": 0.11386445164680481,
"rewards/grpo_reward_func/mean": 0.4191306233406067,
"rewards/grpo_reward_func/std": 0.1961081475019455,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.4375,
"learning_rate": 4.1333333333333333e-07,
"loss": -0.0,
"num_tokens": 2200571.0,
"reward": 0.49487611651420593,
"reward_std": 0.18403539061546326,
"rewards/grpo_reward_func/mean": 0.49487611651420593,
"rewards/grpo_reward_func/std": 0.17239995300769806,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.0625,
"learning_rate": 4.0999999999999994e-07,
"loss": 0.0,
"num_tokens": 2212955.0,
"reward": 0.5409983396530151,
"reward_std": 0.12222976982593536,
"rewards/grpo_reward_func/mean": 0.5409983396530151,
"rewards/grpo_reward_func/std": 0.11841105669736862,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1484375,
"learning_rate": 4.0666666666666666e-07,
"loss": 0.0,
"num_tokens": 2225275.0,
"reward": 0.3465573191642761,
"reward_std": 0.05415717512369156,
"rewards/grpo_reward_func/mean": 0.3465573191642761,
"rewards/grpo_reward_func/std": 0.08967100828886032,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.625,
"learning_rate": 4.033333333333333e-07,
"loss": -0.0,
"num_tokens": 2237591.0,
"reward": 0.40769240260124207,
"reward_std": 0.061508819460868835,
"rewards/grpo_reward_func/mean": 0.40769240260124207,
"rewards/grpo_reward_func/std": 0.11384513974189758,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.75,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 2249983.0,
"reward": 0.4172666072845459,
"reward_std": 0.04795217514038086,
"rewards/grpo_reward_func/mean": 0.4172666072845459,
"rewards/grpo_reward_func/std": 0.06908071041107178,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.0,
"learning_rate": 3.9666666666666665e-07,
"loss": 0.0,
"num_tokens": 2262391.0,
"reward": 0.4887160658836365,
"reward_std": 0.0936364233493805,
"rewards/grpo_reward_func/mean": 0.4887160658836365,
"rewards/grpo_reward_func/std": 0.12800146639347076,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.8125,
"learning_rate": 3.933333333333333e-07,
"loss": 0.0,
"num_tokens": 2274847.0,
"reward": 0.5363283157348633,
"reward_std": 0.09925331920385361,
"rewards/grpo_reward_func/mean": 0.5363283157348633,
"rewards/grpo_reward_func/std": 0.09658796340227127,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.625,
"learning_rate": 3.8999999999999997e-07,
"loss": 0.0,
"num_tokens": 2287319.0,
"reward": 0.41678112745285034,
"reward_std": 0.13148340582847595,
"rewards/grpo_reward_func/mean": 0.41678112745285034,
"rewards/grpo_reward_func/std": 0.1416279673576355,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.5,
"learning_rate": 3.8666666666666664e-07,
"loss": -0.0,
"num_tokens": 2299739.0,
"reward": 0.49028465151786804,
"reward_std": 0.09930803626775742,
"rewards/grpo_reward_func/mean": 0.49028465151786804,
"rewards/grpo_reward_func/std": 0.1043338030576706,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.1875,
"learning_rate": 3.8333333333333335e-07,
"loss": 0.0,
"num_tokens": 2312179.0,
"reward": 0.42906028032302856,
"reward_std": 0.09733951836824417,
"rewards/grpo_reward_func/mean": 0.42906028032302856,
"rewards/grpo_reward_func/std": 0.101521797478199,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.59375,
"learning_rate": 3.7999999999999996e-07,
"loss": -0.0,
"num_tokens": 2324643.0,
"reward": 0.5322451591491699,
"reward_std": 0.05065479129552841,
"rewards/grpo_reward_func/mean": 0.5322451591491699,
"rewards/grpo_reward_func/std": 0.10973110795021057,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.25,
"learning_rate": 3.766666666666666e-07,
"loss": 0.0,
"num_tokens": 2337095.0,
"reward": 0.357377290725708,
"reward_std": 0.11668767035007477,
"rewards/grpo_reward_func/mean": 0.357377290725708,
"rewards/grpo_reward_func/std": 0.11811976879835129,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.4375,
"learning_rate": 3.7333333333333334e-07,
"loss": 0.0,
"num_tokens": 2349527.0,
"reward": 0.4484630823135376,
"reward_std": 0.13092045485973358,
"rewards/grpo_reward_func/mean": 0.4484630823135376,
"rewards/grpo_reward_func/std": 0.18394383788108826,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.625,
"learning_rate": 3.7e-07,
"loss": 0.0,
"num_tokens": 2361935.0,
"reward": 0.5536394119262695,
"reward_std": 0.1294117271900177,
"rewards/grpo_reward_func/mean": 0.5536394119262695,
"rewards/grpo_reward_func/std": 0.1366083174943924,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 5.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.6875,
"learning_rate": 3.666666666666666e-07,
"loss": -0.0,
"num_tokens": 2374335.0,
"reward": 0.43622025847435,
"reward_std": 0.036504555493593216,
"rewards/grpo_reward_func/mean": 0.43622025847435,
"rewards/grpo_reward_func/std": 0.04184015840291977,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.375,
"learning_rate": 3.6333333333333333e-07,
"loss": 0.0,
"num_tokens": 2386703.0,
"reward": 0.32082119584083557,
"reward_std": 0.08303728699684143,
"rewards/grpo_reward_func/mean": 0.32082119584083557,
"rewards/grpo_reward_func/std": 0.09865312278270721,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.0,
"learning_rate": 3.6e-07,
"loss": -0.0,
"num_tokens": 2399127.0,
"reward": 0.40079742670059204,
"reward_std": 0.12725131213665009,
"rewards/grpo_reward_func/mean": 0.40079742670059204,
"rewards/grpo_reward_func/std": 0.16112373769283295,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.6875,
"learning_rate": 3.5666666666666666e-07,
"loss": -0.0,
"num_tokens": 2411571.0,
"reward": 0.43647855520248413,
"reward_std": 0.10620959103107452,
"rewards/grpo_reward_func/mean": 0.43647855520248413,
"rewards/grpo_reward_func/std": 0.09909818321466446,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.6875,
"learning_rate": 3.533333333333333e-07,
"loss": 0.0,
"num_tokens": 2423979.0,
"reward": 0.3994408845901489,
"reward_std": 0.17607587575912476,
"rewards/grpo_reward_func/mean": 0.3994408845901489,
"rewards/grpo_reward_func/std": 0.17166729271411896,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.78125,
"learning_rate": 3.5e-07,
"loss": -0.0,
"num_tokens": 2436363.0,
"reward": 0.4736449420452118,
"reward_std": 0.09779857844114304,
"rewards/grpo_reward_func/mean": 0.4736449420452118,
"rewards/grpo_reward_func/std": 0.11548104882240295,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.0,
"learning_rate": 3.4666666666666665e-07,
"loss": 0.0,
"num_tokens": 2448911.0,
"reward": 0.38275349140167236,
"reward_std": 0.07293462753295898,
"rewards/grpo_reward_func/mean": 0.38275349140167236,
"rewards/grpo_reward_func/std": 0.0916486382484436,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.125,
"learning_rate": 3.433333333333333e-07,
"loss": 0.0,
"num_tokens": 2461263.0,
"reward": 0.5372081995010376,
"reward_std": 0.20845532417297363,
"rewards/grpo_reward_func/mean": 0.5372081995010376,
"rewards/grpo_reward_func/std": 0.21515534818172455,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.125,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"num_tokens": 2473707.0,
"reward": 0.415330171585083,
"reward_std": 0.15996377170085907,
"rewards/grpo_reward_func/mean": 0.415330171585083,
"rewards/grpo_reward_func/std": 0.18506671488285065,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.0625,
"learning_rate": 3.3666666666666664e-07,
"loss": -0.0,
"num_tokens": 2486079.0,
"reward": 0.41273248195648193,
"reward_std": 0.050071652978658676,
"rewards/grpo_reward_func/mean": 0.41273248195648193,
"rewards/grpo_reward_func/std": 0.11006694287061691,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.0625,
"learning_rate": 3.333333333333333e-07,
"loss": -0.0,
"num_tokens": 2498575.0,
"reward": 0.40237534046173096,
"reward_std": 0.16011598706245422,
"rewards/grpo_reward_func/mean": 0.40237534046173096,
"rewards/grpo_reward_func/std": 0.1658114790916443,
"step": 201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.125,
"learning_rate": 3.3e-07,
"loss": 0.0,
"num_tokens": 2511047.0,
"reward": 0.40720921754837036,
"reward_std": 0.10842312127351761,
"rewards/grpo_reward_func/mean": 0.40720921754837036,
"rewards/grpo_reward_func/std": 0.15348494052886963,
"step": 202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.75,
"learning_rate": 3.2666666666666663e-07,
"loss": -0.0,
"num_tokens": 2523499.0,
"reward": 0.46542418003082275,
"reward_std": 0.1260077953338623,
"rewards/grpo_reward_func/mean": 0.46542418003082275,
"rewards/grpo_reward_func/std": 0.1437770575284958,
"step": 203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.4375,
"learning_rate": 3.233333333333333e-07,
"loss": -0.0,
"num_tokens": 2535931.0,
"reward": 0.4416119456291199,
"reward_std": 0.10100536048412323,
"rewards/grpo_reward_func/mean": 0.4416119456291199,
"rewards/grpo_reward_func/std": 0.11735321581363678,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.1875,
"learning_rate": 3.2e-07,
"loss": -0.0,
"num_tokens": 2548287.0,
"reward": 0.40553370118141174,
"reward_std": 0.13550561666488647,
"rewards/grpo_reward_func/mean": 0.40553370118141174,
"rewards/grpo_reward_func/std": 0.13743624091148376,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.75,
"learning_rate": 3.166666666666666e-07,
"loss": -0.0,
"num_tokens": 2560711.0,
"reward": 0.35497111082077026,
"reward_std": 0.11463560163974762,
"rewards/grpo_reward_func/mean": 0.35497111082077026,
"rewards/grpo_reward_func/std": 0.12006353586912155,
"step": 206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.5625,
"learning_rate": 3.1333333333333333e-07,
"loss": 0.0,
"num_tokens": 2573143.0,
"reward": 0.4096822142601013,
"reward_std": 0.05833249166607857,
"rewards/grpo_reward_func/mean": 0.4096822142601013,
"rewards/grpo_reward_func/std": 0.08212708681821823,
"step": 207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.3125,
"learning_rate": 3.1e-07,
"loss": -0.0,
"num_tokens": 2585583.0,
"reward": 0.4554346799850464,
"reward_std": 0.12953370809555054,
"rewards/grpo_reward_func/mean": 0.4554346799850464,
"rewards/grpo_reward_func/std": 0.1593649685382843,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.375,
"learning_rate": 3.066666666666666e-07,
"loss": -0.0,
"num_tokens": 2598031.0,
"reward": 0.5756185054779053,
"reward_std": 0.0809057205915451,
"rewards/grpo_reward_func/mean": 0.5756185054779053,
"rewards/grpo_reward_func/std": 0.10910212993621826,
"step": 209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.9375,
"learning_rate": 3.033333333333333e-07,
"loss": 0.0,
"num_tokens": 2610507.0,
"reward": 0.39368146657943726,
"reward_std": 0.1372520923614502,
"rewards/grpo_reward_func/mean": 0.39368146657943726,
"rewards/grpo_reward_func/std": 0.13048243522644043,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.1875,
"learning_rate": 3e-07,
"loss": 0.0,
"num_tokens": 2623039.0,
"reward": 0.3540037274360657,
"reward_std": 0.08001622557640076,
"rewards/grpo_reward_func/mean": 0.3540037274360657,
"rewards/grpo_reward_func/std": 0.08400996774435043,
"step": 211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.8125,
"learning_rate": 2.966666666666667e-07,
"loss": 0.0,
"num_tokens": 2635395.0,
"reward": 0.44302040338516235,
"reward_std": 0.08312968909740448,
"rewards/grpo_reward_func/mean": 0.44302040338516235,
"rewards/grpo_reward_func/std": 0.0891660526394844,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.78125,
"learning_rate": 2.933333333333333e-07,
"loss": -0.0,
"num_tokens": 2647843.0,
"reward": 0.3831726610660553,
"reward_std": 0.05200519412755966,
"rewards/grpo_reward_func/mean": 0.3831726610660553,
"rewards/grpo_reward_func/std": 0.09167957305908203,
"step": 213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.4375,
"learning_rate": 2.9e-07,
"loss": 0.0,
"num_tokens": 2660251.0,
"reward": 0.3554950952529907,
"reward_std": 0.05713435262441635,
"rewards/grpo_reward_func/mean": 0.3554950952529907,
"rewards/grpo_reward_func/std": 0.0688985213637352,
"step": 214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.375,
"learning_rate": 2.866666666666667e-07,
"loss": -0.0,
"num_tokens": 2672727.0,
"reward": 0.326229453086853,
"reward_std": 0.08010618388652802,
"rewards/grpo_reward_func/mean": 0.326229453086853,
"rewards/grpo_reward_func/std": 0.08994052559137344,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.125,
"learning_rate": 2.833333333333333e-07,
"loss": -0.0,
"num_tokens": 2685119.0,
"reward": 0.4351205825805664,
"reward_std": 0.08398930728435516,
"rewards/grpo_reward_func/mean": 0.4351205825805664,
"rewards/grpo_reward_func/std": 0.08400265872478485,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.09375,
"learning_rate": 2.8e-07,
"loss": 0.0,
"num_tokens": 2697507.0,
"reward": 0.4564037621021271,
"reward_std": 0.08567321300506592,
"rewards/grpo_reward_func/mean": 0.4564037621021271,
"rewards/grpo_reward_func/std": 0.08155813813209534,
"step": 217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.8125,
"learning_rate": 2.766666666666667e-07,
"loss": -0.0,
"num_tokens": 2709927.0,
"reward": 0.4280545115470886,
"reward_std": 0.13084210455417633,
"rewards/grpo_reward_func/mean": 0.4280545115470886,
"rewards/grpo_reward_func/std": 0.13759230077266693,
"step": 218
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.25,
"learning_rate": 2.733333333333333e-07,
"loss": 0.0,
"num_tokens": 2722331.0,
"reward": 0.44869235157966614,
"reward_std": 0.0902545154094696,
"rewards/grpo_reward_func/mean": 0.44869235157966614,
"rewards/grpo_reward_func/std": 0.1127212718129158,
"step": 219
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.375,
"learning_rate": 2.7e-07,
"loss": 0.0,
"num_tokens": 2734779.0,
"reward": 0.4759725332260132,
"reward_std": 0.12860512733459473,
"rewards/grpo_reward_func/mean": 0.4759725332260132,
"rewards/grpo_reward_func/std": 0.1384066343307495,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.75,
"learning_rate": 2.6666666666666667e-07,
"loss": -0.0,
"num_tokens": 2747235.0,
"reward": 0.5738496780395508,
"reward_std": 0.11320274323225021,
"rewards/grpo_reward_func/mean": 0.5738496780395508,
"rewards/grpo_reward_func/std": 0.10654186457395554,
"step": 221
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.9375,
"learning_rate": 2.633333333333333e-07,
"loss": -0.0,
"num_tokens": 2759643.0,
"reward": 0.33652713894844055,
"reward_std": 0.10561183094978333,
"rewards/grpo_reward_func/mean": 0.33652713894844055,
"rewards/grpo_reward_func/std": 0.11127988249063492,
"step": 222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 6.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.25,
"learning_rate": 2.6e-07,
"loss": 0.0,
"num_tokens": 2772083.0,
"reward": 0.45456087589263916,
"reward_std": 0.21474137902259827,
"rewards/grpo_reward_func/mean": 0.45456087589263916,
"rewards/grpo_reward_func/std": 0.20742803812026978,
"step": 223
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.25,
"learning_rate": 2.5666666666666666e-07,
"loss": -0.0,
"num_tokens": 2784487.0,
"reward": 0.36959922313690186,
"reward_std": 0.12393350899219513,
"rewards/grpo_reward_func/mean": 0.36959922313690186,
"rewards/grpo_reward_func/std": 0.18545781075954437,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.375,
"learning_rate": 2.533333333333333e-07,
"loss": 0.0,
"num_tokens": 2796895.0,
"reward": 0.5148861408233643,
"reward_std": 0.10401658713817596,
"rewards/grpo_reward_func/mean": 0.5148861408233643,
"rewards/grpo_reward_func/std": 0.10146593302488327,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.6875,
"learning_rate": 2.5e-07,
"loss": 0.0,
"num_tokens": 2809283.0,
"reward": 0.3833024799823761,
"reward_std": 0.07489189505577087,
"rewards/grpo_reward_func/mean": 0.3833024799823761,
"rewards/grpo_reward_func/std": 0.07100249826908112,
"step": 226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.5625,
"learning_rate": 2.4666666666666665e-07,
"loss": 0.0,
"num_tokens": 2821811.0,
"reward": 0.37905335426330566,
"reward_std": 0.09207235276699066,
"rewards/grpo_reward_func/mean": 0.37905335426330566,
"rewards/grpo_reward_func/std": 0.10075780749320984,
"step": 227
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0693359375,
"learning_rate": 2.433333333333333e-07,
"loss": 0.0,
"num_tokens": 2834259.0,
"reward": 0.5241298079490662,
"reward_std": 0.050000011920928955,
"rewards/grpo_reward_func/mean": 0.5241298079490662,
"rewards/grpo_reward_func/std": 0.11461541801691055,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.4375,
"learning_rate": 2.4e-07,
"loss": -0.0,
"num_tokens": 2846667.0,
"reward": 0.38863605260849,
"reward_std": 0.09145700931549072,
"rewards/grpo_reward_func/mean": 0.38863605260849,
"rewards/grpo_reward_func/std": 0.0854310691356659,
"step": 229
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.6875,
"learning_rate": 2.3666666666666664e-07,
"loss": 0.0,
"num_tokens": 2859183.0,
"reward": 0.48604702949523926,
"reward_std": 0.12953568994998932,
"rewards/grpo_reward_func/mean": 0.48604702949523926,
"rewards/grpo_reward_func/std": 0.12877187132835388,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.875,
"learning_rate": 2.3333333333333333e-07,
"loss": 0.0,
"num_tokens": 2871631.0,
"reward": 0.49290764331817627,
"reward_std": 0.1408785730600357,
"rewards/grpo_reward_func/mean": 0.49290764331817627,
"rewards/grpo_reward_func/std": 0.16115672886371613,
"step": 231
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.4375,
"learning_rate": 2.3e-07,
"loss": 0.0,
"num_tokens": 2884087.0,
"reward": 0.389384388923645,
"reward_std": 0.08452893793582916,
"rewards/grpo_reward_func/mean": 0.389384388923645,
"rewards/grpo_reward_func/std": 0.09952805191278458,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.8125,
"learning_rate": 2.2666666666666663e-07,
"loss": 0.0,
"num_tokens": 2896567.0,
"reward": 0.42921292781829834,
"reward_std": 0.12179729342460632,
"rewards/grpo_reward_func/mean": 0.42921292781829834,
"rewards/grpo_reward_func/std": 0.14654681086540222,
"step": 233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.0625,
"learning_rate": 2.2333333333333332e-07,
"loss": 0.0,
"num_tokens": 2909027.0,
"reward": 0.3906250596046448,
"reward_std": 0.07476774603128433,
"rewards/grpo_reward_func/mean": 0.3906250596046448,
"rewards/grpo_reward_func/std": 0.07509444653987885,
"step": 234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.6875,
"learning_rate": 2.1999999999999998e-07,
"loss": -0.0,
"num_tokens": 2921395.0,
"reward": 0.590385913848877,
"reward_std": 0.07703244686126709,
"rewards/grpo_reward_func/mean": 0.590385913848877,
"rewards/grpo_reward_func/std": 0.10371364653110504,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.375,
"learning_rate": 2.1666666666666667e-07,
"loss": 0.0,
"num_tokens": 2933635.0,
"reward": 0.45836111903190613,
"reward_std": 0.1561897248029709,
"rewards/grpo_reward_func/mean": 0.45836111903190613,
"rewards/grpo_reward_func/std": 0.1763034164905548,
"step": 236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.75,
"learning_rate": 2.1333333333333334e-07,
"loss": 0.0,
"num_tokens": 2946019.0,
"reward": 0.3915758430957794,
"reward_std": 0.09102918207645416,
"rewards/grpo_reward_func/mean": 0.3915758430957794,
"rewards/grpo_reward_func/std": 0.09686020016670227,
"step": 237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.4375,
"learning_rate": 2.0999999999999997e-07,
"loss": 0.0,
"num_tokens": 2958339.0,
"reward": 0.5137478709220886,
"reward_std": 0.06453146040439606,
"rewards/grpo_reward_func/mean": 0.5137478709220886,
"rewards/grpo_reward_func/std": 0.08145393431186676,
"step": 238
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.6875,
"learning_rate": 2.0666666666666666e-07,
"loss": 0.0,
"num_tokens": 2970795.0,
"reward": 0.3901534080505371,
"reward_std": 0.08677110075950623,
"rewards/grpo_reward_func/mean": 0.3901534080505371,
"rewards/grpo_reward_func/std": 0.0884600430727005,
"step": 239
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.25,
"learning_rate": 2.0333333333333333e-07,
"loss": -0.0,
"num_tokens": 2983199.0,
"reward": 0.3966296911239624,
"reward_std": 0.11454164981842041,
"rewards/grpo_reward_func/mean": 0.3966296911239624,
"rewards/grpo_reward_func/std": 0.11218782514333725,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.8125,
"learning_rate": 2e-07,
"loss": -0.0,
"num_tokens": 2995671.0,
"reward": 0.4128722548484802,
"reward_std": 0.1050279289484024,
"rewards/grpo_reward_func/mean": 0.4128722548484802,
"rewards/grpo_reward_func/std": 0.15005381405353546,
"step": 241
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.40625,
"learning_rate": 1.9666666666666665e-07,
"loss": -0.0,
"num_tokens": 3008163.0,
"reward": 0.41674578189849854,
"reward_std": 0.130544051527977,
"rewards/grpo_reward_func/mean": 0.41674578189849854,
"rewards/grpo_reward_func/std": 0.1530657559633255,
"step": 242
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.125,
"learning_rate": 1.9333333333333332e-07,
"loss": 0.0,
"num_tokens": 3020547.0,
"reward": 0.44154661893844604,
"reward_std": 0.11442729830741882,
"rewards/grpo_reward_func/mean": 0.44154661893844604,
"rewards/grpo_reward_func/std": 0.15436801314353943,
"step": 243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.875,
"learning_rate": 1.8999999999999998e-07,
"loss": -0.0,
"num_tokens": 3032907.0,
"reward": 0.41183507442474365,
"reward_std": 0.11221058666706085,
"rewards/grpo_reward_func/mean": 0.41183507442474365,
"rewards/grpo_reward_func/std": 0.10689571499824524,
"step": 244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.0625,
"learning_rate": 1.8666666666666667e-07,
"loss": -0.0,
"num_tokens": 3045283.0,
"reward": 0.3969360589981079,
"reward_std": 0.13579751551151276,
"rewards/grpo_reward_func/mean": 0.3969360589981079,
"rewards/grpo_reward_func/std": 0.14742760360240936,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.5625,
"learning_rate": 1.833333333333333e-07,
"loss": 0.0,
"num_tokens": 3057703.0,
"reward": 0.3443870544433594,
"reward_std": 0.20534491539001465,
"rewards/grpo_reward_func/mean": 0.3443870544433594,
"rewards/grpo_reward_func/std": 0.19916358590126038,
"step": 246
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.5625,
"learning_rate": 1.8e-07,
"loss": 0.0,
"num_tokens": 3070135.0,
"reward": 0.3964824378490448,
"reward_std": 0.13892269134521484,
"rewards/grpo_reward_func/mean": 0.3964824378490448,
"rewards/grpo_reward_func/std": 0.15906588733196259,
"step": 247
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.5625,
"learning_rate": 1.7666666666666666e-07,
"loss": -0.0,
"num_tokens": 3082619.0,
"reward": 0.4210782051086426,
"reward_std": 0.12285022437572479,
"rewards/grpo_reward_func/mean": 0.4210782051086426,
"rewards/grpo_reward_func/std": 0.1384182870388031,
"step": 248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.9375,
"learning_rate": 1.7333333333333332e-07,
"loss": -0.0,
"num_tokens": 3095171.0,
"reward": 0.49818533658981323,
"reward_std": 0.10502855479717255,
"rewards/grpo_reward_func/mean": 0.49818533658981323,
"rewards/grpo_reward_func/std": 0.13689859211444855,
"step": 249
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.6875,
"learning_rate": 1.7000000000000001e-07,
"loss": -0.0,
"num_tokens": 3107575.0,
"reward": 0.39785051345825195,
"reward_std": 0.0737057775259018,
"rewards/grpo_reward_func/mean": 0.39785051345825195,
"rewards/grpo_reward_func/std": 0.08374593406915665,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.1875,
"learning_rate": 1.6666666666666665e-07,
"loss": -0.0,
"num_tokens": 3119987.0,
"reward": 0.3956165909767151,
"reward_std": 0.08730382472276688,
"rewards/grpo_reward_func/mean": 0.3956165909767151,
"rewards/grpo_reward_func/std": 0.12590822577476501,
"step": 251
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.125,
"learning_rate": 1.6333333333333331e-07,
"loss": -0.0,
"num_tokens": 3132419.0,
"reward": 0.40047013759613037,
"reward_std": 0.09308422356843948,
"rewards/grpo_reward_func/mean": 0.40047013759613037,
"rewards/grpo_reward_func/std": 0.10088325291872025,
"step": 252
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.125,
"learning_rate": 1.6e-07,
"loss": -0.0,
"num_tokens": 3144851.0,
"reward": 0.33004331588745117,
"reward_std": 0.04140020161867142,
"rewards/grpo_reward_func/mean": 0.33004331588745117,
"rewards/grpo_reward_func/std": 0.04383409395813942,
"step": 253
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.4375,
"learning_rate": 1.5666666666666667e-07,
"loss": -0.0,
"num_tokens": 3157367.0,
"reward": 0.495669424533844,
"reward_std": 0.095655158162117,
"rewards/grpo_reward_func/mean": 0.495669424533844,
"rewards/grpo_reward_func/std": 0.10840737819671631,
"step": 254
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 7.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.3125,
"learning_rate": 1.533333333333333e-07,
"loss": 0.0,
"num_tokens": 3169795.0,
"reward": 0.398048460483551,
"reward_std": 0.08092916011810303,
"rewards/grpo_reward_func/mean": 0.398048460483551,
"rewards/grpo_reward_func/std": 0.08040700852870941,
"step": 255
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.0625,
"learning_rate": 1.5e-07,
"loss": 0.0,
"num_tokens": 3182271.0,
"reward": 0.4666450321674347,
"reward_std": 0.08053655922412872,
"rewards/grpo_reward_func/mean": 0.4666450321674347,
"rewards/grpo_reward_func/std": 0.11888416111469269,
"step": 256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.4375,
"learning_rate": 1.4666666666666666e-07,
"loss": 0.0,
"num_tokens": 3194699.0,
"reward": 0.4015364646911621,
"reward_std": 0.16598042845726013,
"rewards/grpo_reward_func/mean": 0.4015364646911621,
"rewards/grpo_reward_func/std": 0.16788989305496216,
"step": 257
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.375,
"learning_rate": 1.4333333333333335e-07,
"loss": -0.0,
"num_tokens": 3207091.0,
"reward": 0.48480066657066345,
"reward_std": 0.15683354437351227,
"rewards/grpo_reward_func/mean": 0.48480066657066345,
"rewards/grpo_reward_func/std": 0.14960500597953796,
"step": 258
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.8125,
"learning_rate": 1.4e-07,
"loss": 0.0,
"num_tokens": 3219447.0,
"reward": 0.49088042974472046,
"reward_std": 0.16376182436943054,
"rewards/grpo_reward_func/mean": 0.49088042974472046,
"rewards/grpo_reward_func/std": 0.17037776112556458,
"step": 259
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.6875,
"learning_rate": 1.3666666666666665e-07,
"loss": 0.0,
"num_tokens": 3231843.0,
"reward": 0.4621606469154358,
"reward_std": 0.16308224201202393,
"rewards/grpo_reward_func/mean": 0.4621606469154358,
"rewards/grpo_reward_func/std": 0.18942511081695557,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.375,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.0,
"num_tokens": 3244199.0,
"reward": 0.521634578704834,
"reward_std": 0.08799108862876892,
"rewards/grpo_reward_func/mean": 0.521634578704834,
"rewards/grpo_reward_func/std": 0.08898300677537918,
"step": 261
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.375,
"learning_rate": 1.3e-07,
"loss": -0.0,
"num_tokens": 3256663.0,
"reward": 0.5014014840126038,
"reward_std": 0.10305628925561905,
"rewards/grpo_reward_func/mean": 0.5014014840126038,
"rewards/grpo_reward_func/std": 0.11243268102407455,
"step": 262
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.25,
"learning_rate": 1.2666666666666666e-07,
"loss": 0.0,
"num_tokens": 3269115.0,
"reward": 0.49657315015792847,
"reward_std": 0.14654701948165894,
"rewards/grpo_reward_func/mean": 0.49657315015792847,
"rewards/grpo_reward_func/std": 0.14595918357372284,
"step": 263
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.75,
"learning_rate": 1.2333333333333333e-07,
"loss": 0.0,
"num_tokens": 3281535.0,
"reward": 0.4041872024536133,
"reward_std": 0.1379416286945343,
"rewards/grpo_reward_func/mean": 0.4041872024536133,
"rewards/grpo_reward_func/std": 0.1561095267534256,
"step": 264
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.5,
"learning_rate": 1.2e-07,
"loss": -0.0,
"num_tokens": 3293927.0,
"reward": 0.5414110422134399,
"reward_std": 0.1973114013671875,
"rewards/grpo_reward_func/mean": 0.5414110422134399,
"rewards/grpo_reward_func/std": 0.18588195741176605,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.5,
"learning_rate": 1.1666666666666667e-07,
"loss": -0.0,
"num_tokens": 3306379.0,
"reward": 0.3934594988822937,
"reward_std": 0.025219213217496872,
"rewards/grpo_reward_func/mean": 0.3934594988822937,
"rewards/grpo_reward_func/std": 0.027134951204061508,
"step": 266
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.4375,
"learning_rate": 1.1333333333333332e-07,
"loss": -0.0,
"num_tokens": 3318815.0,
"reward": 0.40915048122406006,
"reward_std": 0.09651514887809753,
"rewards/grpo_reward_func/mean": 0.40915048122406006,
"rewards/grpo_reward_func/std": 0.11164474487304688,
"step": 267
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.3125,
"learning_rate": 1.0999999999999999e-07,
"loss": 0.0,
"num_tokens": 3331207.0,
"reward": 0.3795730471611023,
"reward_std": 0.08440607786178589,
"rewards/grpo_reward_func/mean": 0.3795730471611023,
"rewards/grpo_reward_func/std": 0.08232571184635162,
"step": 268
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.40625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.125,
"learning_rate": 1.0666666666666667e-07,
"loss": -0.0,
"num_tokens": 3343791.0,
"reward": 0.45081427693367004,
"reward_std": 0.13623002171516418,
"rewards/grpo_reward_func/mean": 0.45081427693367004,
"rewards/grpo_reward_func/std": 0.14548543095588684,
"step": 269
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.0625,
"learning_rate": 1.0333333333333333e-07,
"loss": -0.0,
"num_tokens": 3356207.0,
"reward": 0.46069252490997314,
"reward_std": 0.07286226749420166,
"rewards/grpo_reward_func/mean": 0.46069252490997314,
"rewards/grpo_reward_func/std": 0.08740860968828201,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.46875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.875,
"learning_rate": 1e-07,
"loss": 0.0,
"num_tokens": 3368691.0,
"reward": 0.37001582980155945,
"reward_std": 0.08882021903991699,
"rewards/grpo_reward_func/mean": 0.37001582980155945,
"rewards/grpo_reward_func/std": 0.08371038734912872,
"step": 271
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.625,
"learning_rate": 9.666666666666666e-08,
"loss": -0.0,
"num_tokens": 3380935.0,
"reward": 0.46963435411453247,
"reward_std": 0.12529392540454865,
"rewards/grpo_reward_func/mean": 0.46963435411453247,
"rewards/grpo_reward_func/std": 0.13837039470672607,
"step": 272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.53125,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.3125,
"learning_rate": 9.333333333333334e-08,
"loss": -0.0,
"num_tokens": 3393443.0,
"reward": 0.5679957866668701,
"reward_std": 0.08565768599510193,
"rewards/grpo_reward_func/mean": 0.5679957866668701,
"rewards/grpo_reward_func/std": 0.08279130607843399,
"step": 273
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.25,
"learning_rate": 9e-08,
"loss": 0.0,
"num_tokens": 3405771.0,
"reward": 0.3129928410053253,
"reward_std": 0.07984557747840881,
"rewards/grpo_reward_func/mean": 0.3129928410053253,
"rewards/grpo_reward_func/std": 0.08136677742004395,
"step": 274
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.59375,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.0625,
"learning_rate": 8.666666666666666e-08,
"loss": 0.0,
"num_tokens": 3418243.0,
"reward": 0.3354572653770447,
"reward_std": 0.09963542222976685,
"rewards/grpo_reward_func/mean": 0.3354572653770447,
"rewards/grpo_reward_func/std": 0.09654007852077484,
"step": 275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.25,
"learning_rate": 8.333333333333333e-08,
"loss": 0.0,
"num_tokens": 3430691.0,
"reward": 0.41226309537887573,
"reward_std": 0.1296028345823288,
"rewards/grpo_reward_func/mean": 0.41226309537887573,
"rewards/grpo_reward_func/std": 0.12655113637447357,
"step": 276
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.65625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.125,
"learning_rate": 8e-08,
"loss": 0.0,
"num_tokens": 3443151.0,
"reward": 0.4148029088973999,
"reward_std": 0.1445026993751526,
"rewards/grpo_reward_func/mean": 0.4148029088973999,
"rewards/grpo_reward_func/std": 0.1527920663356781,
"step": 277
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.1875,
"learning_rate": 7.666666666666665e-08,
"loss": 0.0,
"num_tokens": 3455579.0,
"reward": 0.3091464638710022,
"reward_std": 0.09873013943433762,
"rewards/grpo_reward_func/mean": 0.3091464638710022,
"rewards/grpo_reward_func/std": 0.12618468701839447,
"step": 278
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.71875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.375,
"learning_rate": 7.333333333333333e-08,
"loss": 0.0,
"num_tokens": 3468011.0,
"reward": 0.412067174911499,
"reward_std": 0.11878905445337296,
"rewards/grpo_reward_func/mean": 0.412067174911499,
"rewards/grpo_reward_func/std": 0.12399723380804062,
"step": 279
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.6875,
"learning_rate": 7e-08,
"loss": -0.0,
"num_tokens": 3480459.0,
"reward": 0.3863711953163147,
"reward_std": 0.1872004270553589,
"rewards/grpo_reward_func/mean": 0.3863711953163147,
"rewards/grpo_reward_func/std": 0.18860581517219543,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.78125,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.125,
"learning_rate": 6.666666666666667e-08,
"loss": -0.0,
"num_tokens": 3492923.0,
"reward": 0.40867847204208374,
"reward_std": 0.14625820517539978,
"rewards/grpo_reward_func/mean": 0.40867847204208374,
"rewards/grpo_reward_func/std": 0.14255009591579437,
"step": 281
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.875,
"learning_rate": 6.333333333333333e-08,
"loss": 0.0,
"num_tokens": 3505387.0,
"reward": 0.45097100734710693,
"reward_std": 0.21717840433120728,
"rewards/grpo_reward_func/mean": 0.45097100734710693,
"rewards/grpo_reward_func/std": 0.20403653383255005,
"step": 282
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.84375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.1875,
"learning_rate": 6e-08,
"loss": -0.0,
"num_tokens": 3517903.0,
"reward": 0.49071210622787476,
"reward_std": 0.13102422654628754,
"rewards/grpo_reward_func/mean": 0.49071210622787476,
"rewards/grpo_reward_func/std": 0.14358305931091309,
"step": 283
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.0,
"learning_rate": 5.666666666666666e-08,
"loss": -0.0,
"num_tokens": 3530331.0,
"reward": 0.471984326839447,
"reward_std": 0.11608313769102097,
"rewards/grpo_reward_func/mean": 0.471984326839447,
"rewards/grpo_reward_func/std": 0.12841607630252838,
"step": 284
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.90625,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.1875,
"learning_rate": 5.3333333333333334e-08,
"loss": -0.0,
"num_tokens": 3542763.0,
"reward": 0.39699018001556396,
"reward_std": 0.11195935308933258,
"rewards/grpo_reward_func/mean": 0.39699018001556396,
"rewards/grpo_reward_func/std": 0.16198311746120453,
"step": 285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.9375,
"learning_rate": 5e-08,
"loss": 0.0,
"num_tokens": 3555171.0,
"reward": 0.40294522047042847,
"reward_std": 0.11233559250831604,
"rewards/grpo_reward_func/mean": 0.40294522047042847,
"rewards/grpo_reward_func/std": 0.12867507338523865,
"step": 286
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 8.96875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.125,
"learning_rate": 4.666666666666667e-08,
"loss": 0.0,
"num_tokens": 3567639.0,
"reward": 0.45153820514678955,
"reward_std": 0.10483110696077347,
"rewards/grpo_reward_func/mean": 0.45153820514678955,
"rewards/grpo_reward_func/std": 0.11334265768527985,
"step": 287
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.9375,
"learning_rate": 4.333333333333333e-08,
"loss": 0.0,
"num_tokens": 3580055.0,
"reward": 0.39824116230010986,
"reward_std": 0.0965305045247078,
"rewards/grpo_reward_func/mean": 0.39824116230010986,
"rewards/grpo_reward_func/std": 0.10601532459259033,
"step": 288
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.8125,
"learning_rate": 4e-08,
"loss": -0.0,
"num_tokens": 3592511.0,
"reward": 0.3396638035774231,
"reward_std": 0.0737166702747345,
"rewards/grpo_reward_func/mean": 0.3396638035774231,
"rewards/grpo_reward_func/std": 0.07909521460533142,
"step": 289
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.0625,
"learning_rate": 3.6666666666666664e-08,
"loss": -0.0,
"num_tokens": 3604847.0,
"reward": 0.4459681212902069,
"reward_std": 0.13664312660694122,
"rewards/grpo_reward_func/mean": 0.4459681212902069,
"rewards/grpo_reward_func/std": 0.1500515192747116,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.09375,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.625,
"learning_rate": 3.3333333333333334e-08,
"loss": -0.0,
"num_tokens": 3617299.0,
"reward": 0.35913753509521484,
"reward_std": 0.10111263394355774,
"rewards/grpo_reward_func/mean": 0.35913753509521484,
"rewards/grpo_reward_func/std": 0.10508442670106888,
"step": 291
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.5625,
"learning_rate": 3e-08,
"loss": -0.0,
"num_tokens": 3629567.0,
"reward": 0.4349736273288727,
"reward_std": 0.12172282487154007,
"rewards/grpo_reward_func/mean": 0.4349736273288727,
"rewards/grpo_reward_func/std": 0.11470159143209457,
"step": 292
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.15625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.0625,
"learning_rate": 2.6666666666666667e-08,
"loss": 0.0,
"num_tokens": 3642071.0,
"reward": 0.396597683429718,
"reward_std": 0.12911826372146606,
"rewards/grpo_reward_func/mean": 0.396597683429718,
"rewards/grpo_reward_func/std": 0.12233106046915054,
"step": 293
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.5,
"learning_rate": 2.3333333333333334e-08,
"loss": 0.0,
"num_tokens": 3654479.0,
"reward": 0.5098578929901123,
"reward_std": 0.1227826401591301,
"rewards/grpo_reward_func/mean": 0.5098578929901123,
"rewards/grpo_reward_func/std": 0.11480940878391266,
"step": 294
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.21875,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.5625,
"learning_rate": 2e-08,
"loss": 0.0,
"num_tokens": 3666891.0,
"reward": 0.40734565258026123,
"reward_std": 0.11240965127944946,
"rewards/grpo_reward_func/mean": 0.40734565258026123,
"rewards/grpo_reward_func/std": 0.13429103791713715,
"step": 295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.21875,
"learning_rate": 1.6666666666666667e-08,
"loss": -0.0,
"num_tokens": 3679471.0,
"reward": 0.37585046887397766,
"reward_std": 0.048339828848838806,
"rewards/grpo_reward_func/mean": 0.37585046887397766,
"rewards/grpo_reward_func/std": 0.059352707117795944,
"step": 296
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.28125,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.4375,
"learning_rate": 1.3333333333333334e-08,
"loss": 0.0,
"num_tokens": 3691927.0,
"reward": 0.3830341100692749,
"reward_std": 0.09623756259679794,
"rewards/grpo_reward_func/mean": 0.3830341100692749,
"rewards/grpo_reward_func/std": 0.0935094878077507,
"step": 297
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.9375,
"learning_rate": 1e-08,
"loss": 0.0,
"num_tokens": 3704431.0,
"reward": 0.5307860374450684,
"reward_std": 0.15707515180110931,
"rewards/grpo_reward_func/mean": 0.5307860374450684,
"rewards/grpo_reward_func/std": 0.15192177891731262,
"step": 298
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.34375,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.4375,
"learning_rate": 6.666666666666667e-09,
"loss": 0.0,
"num_tokens": 3716835.0,
"reward": 0.5075388550758362,
"reward_std": 0.13507473468780518,
"rewards/grpo_reward_func/mean": 0.5075388550758362,
"rewards/grpo_reward_func/std": 0.19023331999778748,
"step": 299
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 9.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.0,
"learning_rate": 3.3333333333333334e-09,
"loss": -0.0,
"num_tokens": 3729299.0,
"reward": 0.4454175531864166,
"reward_std": 0.07014341652393341,
"rewards/grpo_reward_func/mean": 0.4454175531864166,
"rewards/grpo_reward_func/std": 0.1258506029844284,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 300,
"num_input_tokens_seen": 3729299,
"num_train_epochs": 10,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}