15635 lines
540 KiB
JSON
15635 lines
540 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 9.375,
|
|
"eval_steps": 500,
|
|
"global_step": 600,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.90625,
|
|
"kl": 0.0,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.0,
|
|
"num_tokens": 25104.0,
|
|
"reward": 0.38830292224884033,
|
|
"reward_std": 0.0850929468870163,
|
|
"rewards/grpo_reward_func/mean": 0.38830292224884033,
|
|
"rewards/grpo_reward_func/std": 0.08739857375621796,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.875,
|
|
"kl": 0.0,
|
|
"learning_rate": 8.333333333333334e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 50064.0,
|
|
"reward": 0.38852280378341675,
|
|
"reward_std": 0.14650246500968933,
|
|
"rewards/grpo_reward_func/mean": 0.38852280378341675,
|
|
"rewards/grpo_reward_func/std": 0.1692417562007904,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.75,
|
|
"kl": 0.0006874206592328846,
|
|
"learning_rate": 1.6666666666666667e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 75464.0,
|
|
"reward": 0.2590749263763428,
|
|
"reward_std": 0.11121661216020584,
|
|
"rewards/grpo_reward_func/mean": 0.2590749263763428,
|
|
"rewards/grpo_reward_func/std": 0.1687185913324356,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.0003259268924011849,
|
|
"learning_rate": 2.5e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 100632.0,
|
|
"reward": 0.3599267899990082,
|
|
"reward_std": 0.1382569521665573,
|
|
"rewards/grpo_reward_func/mean": 0.3599267899990082,
|
|
"rewards/grpo_reward_func/std": 0.15764164924621582,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.4375,
|
|
"kl": 0.00010777699208119884,
|
|
"learning_rate": 3.3333333333333334e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 125584.0,
|
|
"reward": 0.4119647145271301,
|
|
"reward_std": 0.10570582747459412,
|
|
"rewards/grpo_reward_func/mean": 0.4119647145271301,
|
|
"rewards/grpo_reward_func/std": 0.14051103591918945,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.0,
|
|
"kl": 0.0003376482127350755,
|
|
"learning_rate": 4.166666666666666e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 150800.0,
|
|
"reward": 0.2505754232406616,
|
|
"reward_std": 0.1554991602897644,
|
|
"rewards/grpo_reward_func/mean": 0.2505754232406616,
|
|
"rewards/grpo_reward_func/std": 0.15568533539772034,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.3125,
|
|
"kl": 0.00025600909430067986,
|
|
"learning_rate": 5e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 176576.0,
|
|
"reward": 0.06775141507387161,
|
|
"reward_std": 0.14545553922653198,
|
|
"rewards/grpo_reward_func/mean": 0.06775141507387161,
|
|
"rewards/grpo_reward_func/std": 0.16305957734584808,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.0625,
|
|
"kl": 0.00029405950772343203,
|
|
"learning_rate": 5.833333333333333e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202072.0,
|
|
"reward": 0.11841653287410736,
|
|
"reward_std": 0.0738542377948761,
|
|
"rewards/grpo_reward_func/mean": 0.11841653287410736,
|
|
"rewards/grpo_reward_func/std": 0.07645151019096375,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.4375,
|
|
"kl": 0.0002696753217605874,
|
|
"learning_rate": 6.666666666666667e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 227912.0,
|
|
"reward": 0.10485261678695679,
|
|
"reward_std": 0.09079232811927795,
|
|
"rewards/grpo_reward_func/mean": 0.10485261678695679,
|
|
"rewards/grpo_reward_func/std": 0.08799877762794495,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 19.125,
|
|
"kl": 0.0005563851591432467,
|
|
"learning_rate": 7.5e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 252904.0,
|
|
"reward": 0.3231259882450104,
|
|
"reward_std": 0.16560493409633636,
|
|
"rewards/grpo_reward_func/mean": 0.3231259882450104,
|
|
"rewards/grpo_reward_func/std": 0.1688951849937439,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.125,
|
|
"kl": 7.444247421517503e-05,
|
|
"learning_rate": 8.333333333333333e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 278576.0,
|
|
"reward": 0.214687317609787,
|
|
"reward_std": 0.0769403874874115,
|
|
"rewards/grpo_reward_func/mean": 0.214687317609787,
|
|
"rewards/grpo_reward_func/std": 0.2403011918067932,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.8125,
|
|
"kl": 0.00032506883871974424,
|
|
"learning_rate": 9.166666666666665e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 303920.0,
|
|
"reward": 0.20663060247898102,
|
|
"reward_std": 0.10999321937561035,
|
|
"rewards/grpo_reward_func/mean": 0.20663060247898102,
|
|
"rewards/grpo_reward_func/std": 0.15555834770202637,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.00023091987532097846,
|
|
"learning_rate": 1e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 329024.0,
|
|
"reward": 0.36360427737236023,
|
|
"reward_std": 0.15357878804206848,
|
|
"rewards/grpo_reward_func/mean": 0.36360427737236023,
|
|
"rewards/grpo_reward_func/std": 0.22707244753837585,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.00046382473010453396,
|
|
"learning_rate": 1.0833333333333334e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 353760.0,
|
|
"reward": 0.3822150230407715,
|
|
"reward_std": 0.11560969054698944,
|
|
"rewards/grpo_reward_func/mean": 0.3822150230407715,
|
|
"rewards/grpo_reward_func/std": 0.11733639240264893,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0,
|
|
"kl": 0.00023804418742656708,
|
|
"learning_rate": 1.1666666666666667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 379000.0,
|
|
"reward": 0.17802061140537262,
|
|
"reward_std": 0.08576580137014389,
|
|
"rewards/grpo_reward_func/mean": 0.17802061140537262,
|
|
"rewards/grpo_reward_func/std": 0.1571587771177292,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.1875,
|
|
"kl": 9.951802576324553e-05,
|
|
"learning_rate": 1.25e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 404232.0,
|
|
"reward": 0.1350761502981186,
|
|
"reward_std": 0.07875347137451172,
|
|
"rewards/grpo_reward_func/mean": 0.1350761502981186,
|
|
"rewards/grpo_reward_func/std": 0.14856529235839844,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.9375,
|
|
"kl": 0.0003588832914829254,
|
|
"learning_rate": 1.3333333333333334e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 429392.0,
|
|
"reward": 0.25521713495254517,
|
|
"reward_std": 0.17234337329864502,
|
|
"rewards/grpo_reward_func/mean": 0.25521713495254517,
|
|
"rewards/grpo_reward_func/std": 0.24440135061740875,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.625,
|
|
"kl": 0.000218976605538046,
|
|
"learning_rate": 1.4166666666666665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 454352.0,
|
|
"reward": 0.3575406074523926,
|
|
"reward_std": 0.1384538859128952,
|
|
"rewards/grpo_reward_func/mean": 0.3575406074523926,
|
|
"rewards/grpo_reward_func/std": 0.13475467264652252,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.4375,
|
|
"kl": 0.00021025817841291428,
|
|
"learning_rate": 1.5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 480208.0,
|
|
"reward": 0.09945888817310333,
|
|
"reward_std": 0.09086775779724121,
|
|
"rewards/grpo_reward_func/mean": 0.09945888817310333,
|
|
"rewards/grpo_reward_func/std": 0.10034166276454926,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.625,
|
|
"kl": 0.00023360302293440327,
|
|
"learning_rate": 1.583333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 506432.0,
|
|
"reward": 0.08866722881793976,
|
|
"reward_std": 0.11887718737125397,
|
|
"rewards/grpo_reward_func/mean": 0.08866722881793976,
|
|
"rewards/grpo_reward_func/std": 0.12845923006534576,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.625,
|
|
"kl": 0.0002354470343561843,
|
|
"learning_rate": 1.6666666666666665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 532456.0,
|
|
"reward": 0.06954602897167206,
|
|
"reward_std": 0.11671130359172821,
|
|
"rewards/grpo_reward_func/mean": 0.06954602897167206,
|
|
"rewards/grpo_reward_func/std": 0.11377867311239243,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.0,
|
|
"kl": 0.0003678910434246063,
|
|
"learning_rate": 1.75e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 557416.0,
|
|
"reward": 0.31994929909706116,
|
|
"reward_std": 0.11059385538101196,
|
|
"rewards/grpo_reward_func/mean": 0.31994929909706116,
|
|
"rewards/grpo_reward_func/std": 0.1296655386686325,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5,
|
|
"kl": 0.00020401241636136547,
|
|
"learning_rate": 1.833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 582720.0,
|
|
"reward": 0.3206818401813507,
|
|
"reward_std": 0.10244449228048325,
|
|
"rewards/grpo_reward_func/mean": 0.3206818401813507,
|
|
"rewards/grpo_reward_func/std": 0.17895404994487762,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.8125,
|
|
"kl": 0.00020465596026042476,
|
|
"learning_rate": 1.9166666666666668e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 608040.0,
|
|
"reward": 0.43054676055908203,
|
|
"reward_std": 0.18113180994987488,
|
|
"rewards/grpo_reward_func/mean": 0.43054676055908203,
|
|
"rewards/grpo_reward_func/std": 0.2842019498348236,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.00022962906950851902,
|
|
"learning_rate": 2e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 633024.0,
|
|
"reward": 0.28427720069885254,
|
|
"reward_std": 0.099380724132061,
|
|
"rewards/grpo_reward_func/mean": 0.28427720069885254,
|
|
"rewards/grpo_reward_func/std": 0.11085890978574753,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.4375,
|
|
"kl": 0.00015032757073640823,
|
|
"learning_rate": 2.0833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 657928.0,
|
|
"reward": 0.3395322561264038,
|
|
"reward_std": 0.09541542828083038,
|
|
"rewards/grpo_reward_func/mean": 0.3395322561264038,
|
|
"rewards/grpo_reward_func/std": 0.11023792624473572,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.875,
|
|
"kl": 8.121753853629343e-05,
|
|
"learning_rate": 2.1666666666666667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 682808.0,
|
|
"reward": 0.26600906252861023,
|
|
"reward_std": 0.08304192125797272,
|
|
"rewards/grpo_reward_func/mean": 0.26600906252861023,
|
|
"rewards/grpo_reward_func/std": 0.09309838712215424,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.5,
|
|
"kl": 0.00022083613293943927,
|
|
"learning_rate": 2.25e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 708184.0,
|
|
"reward": 0.23112675547599792,
|
|
"reward_std": 0.08039335906505585,
|
|
"rewards/grpo_reward_func/mean": 0.23112675547599792,
|
|
"rewards/grpo_reward_func/std": 0.20841825008392334,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.0002448335289955139,
|
|
"learning_rate": 2.3333333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 733168.0,
|
|
"reward": 0.3006294071674347,
|
|
"reward_std": 0.13648909330368042,
|
|
"rewards/grpo_reward_func/mean": 0.3006294071674347,
|
|
"rewards/grpo_reward_func/std": 0.1425219625234604,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.5625,
|
|
"kl": 0.0002552429141360335,
|
|
"learning_rate": 2.4166666666666665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 758064.0,
|
|
"reward": 0.26793670654296875,
|
|
"reward_std": 0.12509800493717194,
|
|
"rewards/grpo_reward_func/mean": 0.26793670654296875,
|
|
"rewards/grpo_reward_func/std": 0.15831595659255981,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.5,
|
|
"kl": 0.00015467405319213867,
|
|
"learning_rate": 2.5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 783520.0,
|
|
"reward": 0.07176833599805832,
|
|
"reward_std": 0.12234357744455338,
|
|
"rewards/grpo_reward_func/mean": 0.07176833599805832,
|
|
"rewards/grpo_reward_func/std": 0.12346605211496353,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.625,
|
|
"kl": 0.0002303921282873489,
|
|
"learning_rate": 2.5833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 809312.0,
|
|
"reward": 0.04103652387857437,
|
|
"reward_std": 0.08447092771530151,
|
|
"rewards/grpo_reward_func/mean": 0.04103652387857437,
|
|
"rewards/grpo_reward_func/std": 0.08558139950037003,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.3125,
|
|
"kl": 0.00012425271415850148,
|
|
"learning_rate": 2.6666666666666667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 834584.0,
|
|
"reward": 0.24062111973762512,
|
|
"reward_std": 0.11993544548749924,
|
|
"rewards/grpo_reward_func/mean": 0.24062111973762512,
|
|
"rewards/grpo_reward_func/std": 0.1465650498867035,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.3125,
|
|
"kl": 0.0003296689537819475,
|
|
"learning_rate": 2.75e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 859728.0,
|
|
"reward": 0.31566762924194336,
|
|
"reward_std": 0.15053875744342804,
|
|
"rewards/grpo_reward_func/mean": 0.31566762924194336,
|
|
"rewards/grpo_reward_func/std": 0.17469221353530884,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.625,
|
|
"kl": 0.00011381755030015483,
|
|
"learning_rate": 2.833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 884728.0,
|
|
"reward": 0.34107768535614014,
|
|
"reward_std": 0.14179188013076782,
|
|
"rewards/grpo_reward_func/mean": 0.34107768535614014,
|
|
"rewards/grpo_reward_func/std": 0.14815190434455872,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.125,
|
|
"kl": 0.00012111260366509669,
|
|
"learning_rate": 2.916666666666667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 909552.0,
|
|
"reward": 0.454355925321579,
|
|
"reward_std": 0.14741826057434082,
|
|
"rewards/grpo_reward_func/mean": 0.454355925321579,
|
|
"rewards/grpo_reward_func/std": 0.14277252554893494,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.00044129292655270547,
|
|
"learning_rate": 3e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 934856.0,
|
|
"reward": 0.23417434096336365,
|
|
"reward_std": 0.07929708808660507,
|
|
"rewards/grpo_reward_func/mean": 0.23417434096336365,
|
|
"rewards/grpo_reward_func/std": 0.11618053168058395,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.8125,
|
|
"kl": 0.000225259609578643,
|
|
"learning_rate": 3.0833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 960368.0,
|
|
"reward": 0.22501924633979797,
|
|
"reward_std": 0.1290852576494217,
|
|
"rewards/grpo_reward_func/mean": 0.22501924633979797,
|
|
"rewards/grpo_reward_func/std": 0.23068629205226898,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.0,
|
|
"kl": 0.00030248487746575847,
|
|
"learning_rate": 3.166666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 985632.0,
|
|
"reward": 0.3790042996406555,
|
|
"reward_std": 0.16399240493774414,
|
|
"rewards/grpo_reward_func/mean": 0.3790042996406555,
|
|
"rewards/grpo_reward_func/std": 0.20435301959514618,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0,
|
|
"kl": 0.0002346886321902275,
|
|
"learning_rate": 3.25e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1010840.0,
|
|
"reward": 0.3319600224494934,
|
|
"reward_std": 0.08543172478675842,
|
|
"rewards/grpo_reward_func/mean": 0.3319600224494934,
|
|
"rewards/grpo_reward_func/std": 0.11251801252365112,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.9375,
|
|
"kl": 0.00011254071068833582,
|
|
"learning_rate": 3.333333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1035528.0,
|
|
"reward": 0.5335917472839355,
|
|
"reward_std": 0.09484530240297318,
|
|
"rewards/grpo_reward_func/mean": 0.5335917472839355,
|
|
"rewards/grpo_reward_func/std": 0.12994690239429474,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0625,
|
|
"kl": 0.0003100304602412507,
|
|
"learning_rate": 3.4166666666666664e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1061056.0,
|
|
"reward": 0.2819344103336334,
|
|
"reward_std": 0.09613234549760818,
|
|
"rewards/grpo_reward_func/mean": 0.2819344103336334,
|
|
"rewards/grpo_reward_func/std": 0.17606547474861145,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.25,
|
|
"kl": 0.00022800049191573635,
|
|
"learning_rate": 3.5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1085800.0,
|
|
"reward": 0.3839090168476105,
|
|
"reward_std": 0.1359642744064331,
|
|
"rewards/grpo_reward_func/mean": 0.3839090168476105,
|
|
"rewards/grpo_reward_func/std": 0.14048616588115692,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.8125,
|
|
"kl": 0.0002025471330853179,
|
|
"learning_rate": 3.583333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1110848.0,
|
|
"reward": 0.30948373675346375,
|
|
"reward_std": 0.06383931636810303,
|
|
"rewards/grpo_reward_func/mean": 0.30948373675346375,
|
|
"rewards/grpo_reward_func/std": 0.07188048213720322,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.25,
|
|
"kl": 0.00023324073845287785,
|
|
"learning_rate": 3.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1136088.0,
|
|
"reward": 0.3444882035255432,
|
|
"reward_std": 0.09875836968421936,
|
|
"rewards/grpo_reward_func/mean": 0.3444882035255432,
|
|
"rewards/grpo_reward_func/std": 0.1097148060798645,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.5,
|
|
"kl": 0.00025422839826205745,
|
|
"learning_rate": 3.75e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1161032.0,
|
|
"reward": 0.2477511316537857,
|
|
"reward_std": 0.1270497590303421,
|
|
"rewards/grpo_reward_func/mean": 0.2477511316537857,
|
|
"rewards/grpo_reward_func/std": 0.15981581807136536,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.5,
|
|
"kl": 0.00021969123918097466,
|
|
"learning_rate": 3.8333333333333335e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1185888.0,
|
|
"reward": 0.46379101276397705,
|
|
"reward_std": 0.14611366391181946,
|
|
"rewards/grpo_reward_func/mean": 0.46379101276397705,
|
|
"rewards/grpo_reward_func/std": 0.16491258144378662,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.8125,
|
|
"kl": 0.0004819600435439497,
|
|
"learning_rate": 3.9166666666666664e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1211400.0,
|
|
"reward": 0.23074069619178772,
|
|
"reward_std": 0.14558511972427368,
|
|
"rewards/grpo_reward_func/mean": 0.23074069619178772,
|
|
"rewards/grpo_reward_func/std": 0.2206806093454361,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.1875,
|
|
"kl": 6.555269101227168e-05,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1237400.0,
|
|
"reward": 0.05634097009897232,
|
|
"reward_std": 0.06590264290571213,
|
|
"rewards/grpo_reward_func/mean": 0.05634097009897232,
|
|
"rewards/grpo_reward_func/std": 0.10430527478456497,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.9375,
|
|
"kl": 0.0001880628988146782,
|
|
"learning_rate": 4.083333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1262712.0,
|
|
"reward": 0.2721788287162781,
|
|
"reward_std": 0.0947684645652771,
|
|
"rewards/grpo_reward_func/mean": 0.2721788287162781,
|
|
"rewards/grpo_reward_func/std": 0.12943580746650696,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.8125,
|
|
"kl": 0.0001086403317458462,
|
|
"learning_rate": 4.1666666666666667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1287648.0,
|
|
"reward": 0.4438517093658447,
|
|
"reward_std": 0.14172330498695374,
|
|
"rewards/grpo_reward_func/mean": 0.4438517093658447,
|
|
"rewards/grpo_reward_func/std": 0.1430020034313202,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.8125,
|
|
"kl": 0.0003144231850455981,
|
|
"learning_rate": 4.2499999999999995e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1312272.0,
|
|
"reward": 0.299557626247406,
|
|
"reward_std": 0.11285382509231567,
|
|
"rewards/grpo_reward_func/mean": 0.299557626247406,
|
|
"rewards/grpo_reward_func/std": 0.11851979792118073,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.9375,
|
|
"kl": 0.00012044112008879893,
|
|
"learning_rate": 4.3333333333333335e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1337296.0,
|
|
"reward": 0.4098794460296631,
|
|
"reward_std": 0.11588951200246811,
|
|
"rewards/grpo_reward_func/mean": 0.4098794460296631,
|
|
"rewards/grpo_reward_func/std": 0.11837032437324524,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 19.25,
|
|
"kl": 0.00035638889676192775,
|
|
"learning_rate": 4.4166666666666664e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1362296.0,
|
|
"reward": 0.34147408604621887,
|
|
"reward_std": 0.08064563572406769,
|
|
"rewards/grpo_reward_func/mean": 0.34147408604621887,
|
|
"rewards/grpo_reward_func/std": 0.09037595242261887,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.875,
|
|
"kl": 0.00010125773405889049,
|
|
"learning_rate": 4.5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1387488.0,
|
|
"reward": 0.34650981426239014,
|
|
"reward_std": 0.10842345654964447,
|
|
"rewards/grpo_reward_func/mean": 0.34650981426239014,
|
|
"rewards/grpo_reward_func/std": 0.1347353607416153,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.1875,
|
|
"kl": 0.00020377027249196544,
|
|
"learning_rate": 4.5833333333333327e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1412400.0,
|
|
"reward": 0.4124220311641693,
|
|
"reward_std": 0.0689394623041153,
|
|
"rewards/grpo_reward_func/mean": 0.4124220311641693,
|
|
"rewards/grpo_reward_func/std": 0.1449054330587387,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.1875,
|
|
"kl": 0.00019250033437856473,
|
|
"learning_rate": 4.6666666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1437976.0,
|
|
"reward": 0.2810817062854767,
|
|
"reward_std": 0.10197813808917999,
|
|
"rewards/grpo_reward_func/mean": 0.2810817062854767,
|
|
"rewards/grpo_reward_func/std": 0.2297528237104416,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.375,
|
|
"kl": 0.00023638892162125558,
|
|
"learning_rate": 4.7499999999999995e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1463016.0,
|
|
"reward": 0.4437309503555298,
|
|
"reward_std": 0.15493561327457428,
|
|
"rewards/grpo_reward_func/mean": 0.4437309503555298,
|
|
"rewards/grpo_reward_func/std": 0.16053226590156555,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.0003489665687084198,
|
|
"learning_rate": 4.833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1487920.0,
|
|
"reward": 0.27588674426078796,
|
|
"reward_std": 0.09553509950637817,
|
|
"rewards/grpo_reward_func/mean": 0.27588674426078796,
|
|
"rewards/grpo_reward_func/std": 0.09245670586824417,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.8125,
|
|
"kl": 0.00045755444443784654,
|
|
"learning_rate": 4.916666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1513336.0,
|
|
"reward": 0.08732398599386215,
|
|
"reward_std": 0.10541350394487381,
|
|
"rewards/grpo_reward_func/mean": 0.08732398599386215,
|
|
"rewards/grpo_reward_func/std": 0.10521270334720612,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5,
|
|
"kl": 0.000195571225049207,
|
|
"learning_rate": 5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1538576.0,
|
|
"reward": 0.2950664162635803,
|
|
"reward_std": 0.11005343496799469,
|
|
"rewards/grpo_reward_func/mean": 0.2950664162635803,
|
|
"rewards/grpo_reward_func/std": 0.2244093418121338,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.75,
|
|
"kl": 0.00013369570297072642,
|
|
"learning_rate": 4.990740740740741e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1563896.0,
|
|
"reward": 0.17187045514583588,
|
|
"reward_std": 0.06180661916732788,
|
|
"rewards/grpo_reward_func/mean": 0.17187045514583588,
|
|
"rewards/grpo_reward_func/std": 0.19724208116531372,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5,
|
|
"kl": 0.00016097879051812924,
|
|
"learning_rate": 4.981481481481482e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1589008.0,
|
|
"reward": 0.2205008864402771,
|
|
"reward_std": 0.053931236267089844,
|
|
"rewards/grpo_reward_func/mean": 0.2205008864402771,
|
|
"rewards/grpo_reward_func/std": 0.057305920869112015,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.75,
|
|
"kl": 0.00014925623690942302,
|
|
"learning_rate": 4.972222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1613816.0,
|
|
"reward": 0.4431067705154419,
|
|
"reward_std": 0.14857327938079834,
|
|
"rewards/grpo_reward_func/mean": 0.4431067705154419,
|
|
"rewards/grpo_reward_func/std": 0.17278340458869934,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5,
|
|
"kl": 0.0001297382063967234,
|
|
"learning_rate": 4.962962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1639560.0,
|
|
"reward": 0.2275439351797104,
|
|
"reward_std": 0.1384904682636261,
|
|
"rewards/grpo_reward_func/mean": 0.2275439351797104,
|
|
"rewards/grpo_reward_func/std": 0.1672850400209427,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.3125,
|
|
"kl": 6.904608108015964e-05,
|
|
"learning_rate": 4.953703703703703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1665128.0,
|
|
"reward": 0.047979630529880524,
|
|
"reward_std": 0.11713965237140656,
|
|
"rewards/grpo_reward_func/mean": 0.047979630529880524,
|
|
"rewards/grpo_reward_func/std": 0.11486222594976425,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.25,
|
|
"kl": 5.872082147106994e-05,
|
|
"learning_rate": 4.944444444444445e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1690152.0,
|
|
"reward": 0.29636937379837036,
|
|
"reward_std": 0.19861865043640137,
|
|
"rewards/grpo_reward_func/mean": 0.29636937379837036,
|
|
"rewards/grpo_reward_func/std": 0.20937326550483704,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.625,
|
|
"kl": 0.0003152005447191186,
|
|
"learning_rate": 4.935185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1715608.0,
|
|
"reward": 0.2516125440597534,
|
|
"reward_std": 0.07490938156843185,
|
|
"rewards/grpo_reward_func/mean": 0.2516125440597534,
|
|
"rewards/grpo_reward_func/std": 0.14309823513031006,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.875,
|
|
"kl": 0.0002456077709211968,
|
|
"learning_rate": 4.925925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1741720.0,
|
|
"reward": 0.11342965066432953,
|
|
"reward_std": 0.06376887857913971,
|
|
"rewards/grpo_reward_func/mean": 0.11342965066432953,
|
|
"rewards/grpo_reward_func/std": 0.11074693500995636,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.0,
|
|
"kl": 4.5569922804133967e-05,
|
|
"learning_rate": 4.916666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1767576.0,
|
|
"reward": 0.054858915507793427,
|
|
"reward_std": 0.13666033744812012,
|
|
"rewards/grpo_reward_func/mean": 0.054858915507793427,
|
|
"rewards/grpo_reward_func/std": 0.169277161359787,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.625,
|
|
"kl": 0.0002712399436859414,
|
|
"learning_rate": 4.907407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1792840.0,
|
|
"reward": 0.13117259740829468,
|
|
"reward_std": 0.0819777101278305,
|
|
"rewards/grpo_reward_func/mean": 0.13117259740829468,
|
|
"rewards/grpo_reward_func/std": 0.09603901952505112,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 20.125,
|
|
"kl": 0.00017950467008631676,
|
|
"learning_rate": 4.898148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1818224.0,
|
|
"reward": 0.2819082736968994,
|
|
"reward_std": 0.20238551497459412,
|
|
"rewards/grpo_reward_func/mean": 0.2819082736968994,
|
|
"rewards/grpo_reward_func/std": 0.2046259194612503,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.00045007342123426497,
|
|
"learning_rate": 4.888888888888889e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1843328.0,
|
|
"reward": 0.30494439601898193,
|
|
"reward_std": 0.10446885973215103,
|
|
"rewards/grpo_reward_func/mean": 0.30494439601898193,
|
|
"rewards/grpo_reward_func/std": 0.13383115828037262,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.1875,
|
|
"kl": 0.00025242754054488614,
|
|
"learning_rate": 4.87962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1868936.0,
|
|
"reward": 0.21628513932228088,
|
|
"reward_std": 0.09445344656705856,
|
|
"rewards/grpo_reward_func/mean": 0.21628513932228088,
|
|
"rewards/grpo_reward_func/std": 0.18989995121955872,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.25,
|
|
"kl": 0.00047839961189311,
|
|
"learning_rate": 4.87037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1893848.0,
|
|
"reward": 0.38288578391075134,
|
|
"reward_std": 0.1278231143951416,
|
|
"rewards/grpo_reward_func/mean": 0.38288578391075134,
|
|
"rewards/grpo_reward_func/std": 0.1358482390642166,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.75,
|
|
"kl": 0.00021722633391618729,
|
|
"learning_rate": 4.861111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1918984.0,
|
|
"reward": 0.2882534861564636,
|
|
"reward_std": 0.1554815173149109,
|
|
"rewards/grpo_reward_func/mean": 0.2882534861564636,
|
|
"rewards/grpo_reward_func/std": 0.214204341173172,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.00019398704171180725,
|
|
"learning_rate": 4.851851851851852e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1944584.0,
|
|
"reward": 0.21863818168640137,
|
|
"reward_std": 0.11801601201295853,
|
|
"rewards/grpo_reward_func/mean": 0.21863818168640137,
|
|
"rewards/grpo_reward_func/std": 0.1758543998003006,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.375,
|
|
"kl": 0.00019207410514354706,
|
|
"learning_rate": 4.842592592592593e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1969664.0,
|
|
"reward": 0.41489866375923157,
|
|
"reward_std": 0.1088038831949234,
|
|
"rewards/grpo_reward_func/mean": 0.41489866375923157,
|
|
"rewards/grpo_reward_func/std": 0.17758165299892426,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.9375,
|
|
"kl": 0.00017749394464772195,
|
|
"learning_rate": 4.833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1994640.0,
|
|
"reward": 0.42801088094711304,
|
|
"reward_std": 0.10921289026737213,
|
|
"rewards/grpo_reward_func/mean": 0.42801088094711304,
|
|
"rewards/grpo_reward_func/std": 0.14644165337085724,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.00038930773735046387,
|
|
"learning_rate": 4.824074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2020152.0,
|
|
"reward": 0.16573889553546906,
|
|
"reward_std": 0.08702096343040466,
|
|
"rewards/grpo_reward_func/mean": 0.16573889553546906,
|
|
"rewards/grpo_reward_func/std": 0.12194234132766724,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.96875,
|
|
"kl": 0.00011919004100491293,
|
|
"learning_rate": 4.814814814814814e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2044944.0,
|
|
"reward": 0.41396480798721313,
|
|
"reward_std": 0.09650741517543793,
|
|
"rewards/grpo_reward_func/mean": 0.41396480798721313,
|
|
"rewards/grpo_reward_func/std": 0.11427146941423416,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.75,
|
|
"kl": 0.0002481179908500053,
|
|
"learning_rate": 4.805555555555555e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2070520.0,
|
|
"reward": 0.22157645225524902,
|
|
"reward_std": 0.09660083055496216,
|
|
"rewards/grpo_reward_func/mean": 0.22157645225524902,
|
|
"rewards/grpo_reward_func/std": 0.20230649411678314,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5625,
|
|
"kl": 0.00045967100595589727,
|
|
"learning_rate": 4.796296296296296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2095464.0,
|
|
"reward": 0.5041211843490601,
|
|
"reward_std": 0.1389543116092682,
|
|
"rewards/grpo_reward_func/mean": 0.5041211843490601,
|
|
"rewards/grpo_reward_func/std": 0.1713102161884308,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.03125,
|
|
"kl": 0.0001403558962920215,
|
|
"learning_rate": 4.787037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2120688.0,
|
|
"reward": 0.18113520741462708,
|
|
"reward_std": 0.04671812802553177,
|
|
"rewards/grpo_reward_func/mean": 0.18113520741462708,
|
|
"rewards/grpo_reward_func/std": 0.12348335981369019,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.25,
|
|
"kl": 0.00013873229181626812,
|
|
"learning_rate": 4.777777777777778e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2146368.0,
|
|
"reward": 0.04078834131360054,
|
|
"reward_std": 0.12815354764461517,
|
|
"rewards/grpo_reward_func/mean": 0.04078834131360054,
|
|
"rewards/grpo_reward_func/std": 0.12462079524993896,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.375,
|
|
"kl": 0.0004987806605640799,
|
|
"learning_rate": 4.768518518518518e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2171880.0,
|
|
"reward": 0.12024304270744324,
|
|
"reward_std": 0.1480271965265274,
|
|
"rewards/grpo_reward_func/mean": 0.12024304270744324,
|
|
"rewards/grpo_reward_func/std": 0.1638474464416504,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5,
|
|
"kl": 0.0003001453878823668,
|
|
"learning_rate": 4.759259259259259e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2196888.0,
|
|
"reward": 0.3662007451057434,
|
|
"reward_std": 0.10381059348583221,
|
|
"rewards/grpo_reward_func/mean": 0.3662007451057434,
|
|
"rewards/grpo_reward_func/std": 0.10623158514499664,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.875,
|
|
"kl": 0.00020138671243330464,
|
|
"learning_rate": 4.7499999999999995e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2221912.0,
|
|
"reward": 0.3808116912841797,
|
|
"reward_std": 0.06605124473571777,
|
|
"rewards/grpo_reward_func/mean": 0.3808116912841797,
|
|
"rewards/grpo_reward_func/std": 0.0681883841753006,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.6875,
|
|
"kl": 0.00010015349835157394,
|
|
"learning_rate": 4.7407407407407405e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2247072.0,
|
|
"reward": 0.29432040452957153,
|
|
"reward_std": 0.06491278856992722,
|
|
"rewards/grpo_reward_func/mean": 0.29432040452957153,
|
|
"rewards/grpo_reward_func/std": 0.11752089112997055,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.000202179577172501,
|
|
"learning_rate": 4.731481481481481e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2272480.0,
|
|
"reward": 0.2932785451412201,
|
|
"reward_std": 0.08380497992038727,
|
|
"rewards/grpo_reward_func/mean": 0.2932785451412201,
|
|
"rewards/grpo_reward_func/std": 0.23896603286266327,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.0004609463067026809,
|
|
"learning_rate": 4.722222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2297896.0,
|
|
"reward": 0.21311567723751068,
|
|
"reward_std": 0.09745917469263077,
|
|
"rewards/grpo_reward_func/mean": 0.21311567723751068,
|
|
"rewards/grpo_reward_func/std": 0.16419465839862823,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5,
|
|
"kl": 0.0002996331677422859,
|
|
"learning_rate": 4.7129629629629626e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2322992.0,
|
|
"reward": 0.2457921952009201,
|
|
"reward_std": 0.2100134938955307,
|
|
"rewards/grpo_reward_func/mean": 0.2457921952009201,
|
|
"rewards/grpo_reward_func/std": 0.2291945517063141,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.25,
|
|
"kl": 0.00025584021204849705,
|
|
"learning_rate": 4.7037037037037036e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2348168.0,
|
|
"reward": 0.26080936193466187,
|
|
"reward_std": 0.10516969859600067,
|
|
"rewards/grpo_reward_func/mean": 0.26080936193466187,
|
|
"rewards/grpo_reward_func/std": 0.13873416185379028,
|
|
"step": 93
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.125,
|
|
"kl": 0.00024177134764613584,
|
|
"learning_rate": 4.694444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2373200.0,
|
|
"reward": 0.20706593990325928,
|
|
"reward_std": 0.09836722910404205,
|
|
"rewards/grpo_reward_func/mean": 0.20706593990325928,
|
|
"rewards/grpo_reward_func/std": 0.14156295359134674,
|
|
"step": 94
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.00038407588726840913,
|
|
"learning_rate": 4.6851851851851846e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2398136.0,
|
|
"reward": 0.4286128580570221,
|
|
"reward_std": 0.14354585111141205,
|
|
"rewards/grpo_reward_func/mean": 0.4286128580570221,
|
|
"rewards/grpo_reward_func/std": 0.1891767978668213,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.40625,
|
|
"kl": 0.0002914089673140552,
|
|
"learning_rate": 4.675925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2423008.0,
|
|
"reward": 0.32350456714630127,
|
|
"reward_std": 0.07581804692745209,
|
|
"rewards/grpo_reward_func/mean": 0.32350456714630127,
|
|
"rewards/grpo_reward_func/std": 0.14216163754463196,
|
|
"step": 96
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.75,
|
|
"kl": 0.00029821879797964357,
|
|
"learning_rate": 4.6666666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2447880.0,
|
|
"reward": 0.53780198097229,
|
|
"reward_std": 0.13564899563789368,
|
|
"rewards/grpo_reward_func/mean": 0.53780198097229,
|
|
"rewards/grpo_reward_func/std": 0.13389934599399567,
|
|
"step": 97
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.9375,
|
|
"kl": 0.00042561162263154984,
|
|
"learning_rate": 4.657407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2473776.0,
|
|
"reward": 0.0427071787416935,
|
|
"reward_std": 0.08712250739336014,
|
|
"rewards/grpo_reward_func/mean": 0.0427071787416935,
|
|
"rewards/grpo_reward_func/std": 0.08546540886163712,
|
|
"step": 98
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.84375,
|
|
"kl": 0.0001698797568678856,
|
|
"learning_rate": 4.6481481481481476e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2499040.0,
|
|
"reward": 0.24955210089683533,
|
|
"reward_std": 0.06630256026983261,
|
|
"rewards/grpo_reward_func/mean": 0.24955210089683533,
|
|
"rewards/grpo_reward_func/std": 0.09607692807912827,
|
|
"step": 99
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.8125,
|
|
"kl": 0.0011094513902207837,
|
|
"learning_rate": 4.6388888888888886e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2524040.0,
|
|
"reward": 0.37988805770874023,
|
|
"reward_std": 0.06495396792888641,
|
|
"rewards/grpo_reward_func/mean": 0.37988805770874023,
|
|
"rewards/grpo_reward_func/std": 0.07306870073080063,
|
|
"step": 100
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0625,
|
|
"kl": 0.0002060849146801047,
|
|
"learning_rate": 4.6296296296296297e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2548920.0,
|
|
"reward": 0.3687247037887573,
|
|
"reward_std": 0.09085649251937866,
|
|
"rewards/grpo_reward_func/mean": 0.3687247037887573,
|
|
"rewards/grpo_reward_func/std": 0.09450332075357437,
|
|
"step": 101
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.875,
|
|
"kl": 0.00026761522894958034,
|
|
"learning_rate": 4.62037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2574456.0,
|
|
"reward": 0.09724560379981995,
|
|
"reward_std": 0.16156351566314697,
|
|
"rewards/grpo_reward_func/mean": 0.09724560379981995,
|
|
"rewards/grpo_reward_func/std": 0.16980990767478943,
|
|
"step": 102
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.75,
|
|
"kl": 0.0004442011268110946,
|
|
"learning_rate": 4.611111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2599584.0,
|
|
"reward": 0.2229943573474884,
|
|
"reward_std": 0.0884079784154892,
|
|
"rewards/grpo_reward_func/mean": 0.2229943573474884,
|
|
"rewards/grpo_reward_func/std": 0.09148803353309631,
|
|
"step": 103
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.75,
|
|
"kl": 0.00045584855251945555,
|
|
"learning_rate": 4.6018518518518517e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2624336.0,
|
|
"reward": 0.4652545750141144,
|
|
"reward_std": 0.09365322440862656,
|
|
"rewards/grpo_reward_func/mean": 0.4652545750141144,
|
|
"rewards/grpo_reward_func/std": 0.09660826623439789,
|
|
"step": 104
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.625,
|
|
"kl": 0.000538857959327288,
|
|
"learning_rate": 4.592592592592592e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2650008.0,
|
|
"reward": 0.22665664553642273,
|
|
"reward_std": 0.08802333474159241,
|
|
"rewards/grpo_reward_func/mean": 0.22665664553642273,
|
|
"rewards/grpo_reward_func/std": 0.15664224326610565,
|
|
"step": 105
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.3125,
|
|
"kl": 0.0010045859962701797,
|
|
"learning_rate": 4.5833333333333327e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2675152.0,
|
|
"reward": 0.27369487285614014,
|
|
"reward_std": 0.23123349249362946,
|
|
"rewards/grpo_reward_func/mean": 0.27369487285614014,
|
|
"rewards/grpo_reward_func/std": 0.24010290205478668,
|
|
"step": 106
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.875,
|
|
"kl": 0.00037650827107427176,
|
|
"learning_rate": 4.574074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2700032.0,
|
|
"reward": 0.3958902955055237,
|
|
"reward_std": 0.17344766855239868,
|
|
"rewards/grpo_reward_func/mean": 0.3958902955055237,
|
|
"rewards/grpo_reward_func/std": 0.17110610008239746,
|
|
"step": 107
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.375,
|
|
"kl": 0.0003967365773860365,
|
|
"learning_rate": 4.564814814814815e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2725416.0,
|
|
"reward": 0.31058400869369507,
|
|
"reward_std": 0.1508956253528595,
|
|
"rewards/grpo_reward_func/mean": 0.31058400869369507,
|
|
"rewards/grpo_reward_func/std": 0.19560779631137848,
|
|
"step": 108
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.90625,
|
|
"kl": 0.0006652238371316344,
|
|
"learning_rate": 4.555555555555555e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2750208.0,
|
|
"reward": 0.4809446930885315,
|
|
"reward_std": 0.14983296394348145,
|
|
"rewards/grpo_reward_func/mean": 0.4809446930885315,
|
|
"rewards/grpo_reward_func/std": 0.14483514428138733,
|
|
"step": 109
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.4375,
|
|
"kl": 0.00035489382571540773,
|
|
"learning_rate": 4.5462962962962957e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2775504.0,
|
|
"reward": 0.22507444024085999,
|
|
"reward_std": 0.08060501515865326,
|
|
"rewards/grpo_reward_func/mean": 0.22507444024085999,
|
|
"rewards/grpo_reward_func/std": 0.16900670528411865,
|
|
"step": 110
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.625,
|
|
"kl": 0.0005258495511952788,
|
|
"learning_rate": 4.537037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2800328.0,
|
|
"reward": 0.3424970507621765,
|
|
"reward_std": 0.13174626231193542,
|
|
"rewards/grpo_reward_func/mean": 0.3424970507621765,
|
|
"rewards/grpo_reward_func/std": 0.1453038901090622,
|
|
"step": 111
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.8125,
|
|
"kl": 0.0002995077520608902,
|
|
"learning_rate": 4.527777777777778e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2825752.0,
|
|
"reward": 0.23321793973445892,
|
|
"reward_std": 0.09155033528804779,
|
|
"rewards/grpo_reward_func/mean": 0.23321793973445892,
|
|
"rewards/grpo_reward_func/std": 0.09220299124717712,
|
|
"step": 112
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.09375,
|
|
"kl": 0.0007548263820353895,
|
|
"learning_rate": 4.5185185185185183e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2850680.0,
|
|
"reward": 0.3993395268917084,
|
|
"reward_std": 0.12416817247867584,
|
|
"rewards/grpo_reward_func/mean": 0.3993395268917084,
|
|
"rewards/grpo_reward_func/std": 0.1299920380115509,
|
|
"step": 113
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.0,
|
|
"kl": 0.00034774900996126235,
|
|
"learning_rate": 4.5092592592592593e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2876104.0,
|
|
"reward": 0.3269900381565094,
|
|
"reward_std": 0.0974041149020195,
|
|
"rewards/grpo_reward_func/mean": 0.3269900381565094,
|
|
"rewards/grpo_reward_func/std": 0.24951301515102386,
|
|
"step": 114
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.875,
|
|
"kl": 0.0004522266535786912,
|
|
"learning_rate": 4.5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2901024.0,
|
|
"reward": 0.42044663429260254,
|
|
"reward_std": 0.1444629728794098,
|
|
"rewards/grpo_reward_func/mean": 0.42044663429260254,
|
|
"rewards/grpo_reward_func/std": 0.14634843170642853,
|
|
"step": 115
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.8125,
|
|
"kl": 0.0006757595692761242,
|
|
"learning_rate": 4.4907407407407403e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2926040.0,
|
|
"reward": 0.34109240770339966,
|
|
"reward_std": 0.133412703871727,
|
|
"rewards/grpo_reward_func/mean": 0.34109240770339966,
|
|
"rewards/grpo_reward_func/std": 0.12967567145824432,
|
|
"step": 116
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.8125,
|
|
"kl": 0.0002280392945976928,
|
|
"learning_rate": 4.4814814814814813e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2951408.0,
|
|
"reward": 0.292447566986084,
|
|
"reward_std": 0.14831313490867615,
|
|
"rewards/grpo_reward_func/mean": 0.292447566986084,
|
|
"rewards/grpo_reward_func/std": 0.2516910433769226,
|
|
"step": 117
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.875,
|
|
"kl": 0.0001139113082899712,
|
|
"learning_rate": 4.4722222222222223e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2976456.0,
|
|
"reward": 0.3284240961074829,
|
|
"reward_std": 0.05646292120218277,
|
|
"rewards/grpo_reward_func/mean": 0.3284240961074829,
|
|
"rewards/grpo_reward_func/std": 0.08714665472507477,
|
|
"step": 118
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.5625,
|
|
"kl": 0.00031560100615024567,
|
|
"learning_rate": 4.462962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3001344.0,
|
|
"reward": 0.3213407099246979,
|
|
"reward_std": 0.07400795072317123,
|
|
"rewards/grpo_reward_func/mean": 0.3213407099246979,
|
|
"rewards/grpo_reward_func/std": 0.07413279265165329,
|
|
"step": 119
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.00014330726116895676,
|
|
"learning_rate": 4.4537037037037033e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3026520.0,
|
|
"reward": 0.2996346056461334,
|
|
"reward_std": 0.10851671546697617,
|
|
"rewards/grpo_reward_func/mean": 0.2996346056461334,
|
|
"rewards/grpo_reward_func/std": 0.14822526276111603,
|
|
"step": 120
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.1875,
|
|
"kl": 0.0003187606780556962,
|
|
"learning_rate": 4.444444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3051592.0,
|
|
"reward": 0.3903766870498657,
|
|
"reward_std": 0.09058903157711029,
|
|
"rewards/grpo_reward_func/mean": 0.3903766870498657,
|
|
"rewards/grpo_reward_func/std": 0.0985388308763504,
|
|
"step": 121
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.5625,
|
|
"kl": 0.0011158532870467752,
|
|
"learning_rate": 4.4351851851851854e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3076160.0,
|
|
"reward": 0.4783337712287903,
|
|
"reward_std": 0.09513237327337265,
|
|
"rewards/grpo_reward_func/mean": 0.4783337712287903,
|
|
"rewards/grpo_reward_func/std": 0.15691599249839783,
|
|
"step": 122
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.75,
|
|
"kl": 0.0004599780368153006,
|
|
"learning_rate": 4.425925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3101632.0,
|
|
"reward": 0.21846909821033478,
|
|
"reward_std": 0.1349727213382721,
|
|
"rewards/grpo_reward_func/mean": 0.21846909821033478,
|
|
"rewards/grpo_reward_func/std": 0.1770293265581131,
|
|
"step": 123
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.0625,
|
|
"kl": 0.00045861614489695057,
|
|
"learning_rate": 4.4166666666666664e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3126688.0,
|
|
"reward": 0.35453712940216064,
|
|
"reward_std": 0.07507544755935669,
|
|
"rewards/grpo_reward_func/mean": 0.35453712940216064,
|
|
"rewards/grpo_reward_func/std": 0.08851905167102814,
|
|
"step": 124
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.375,
|
|
"kl": 0.000633828341960907,
|
|
"learning_rate": 4.4074074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3151600.0,
|
|
"reward": 0.42087167501449585,
|
|
"reward_std": 0.10288920998573303,
|
|
"rewards/grpo_reward_func/mean": 0.42087167501449585,
|
|
"rewards/grpo_reward_func/std": 0.2240506261587143,
|
|
"step": 125
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.125,
|
|
"kl": 0.0003298576921224594,
|
|
"learning_rate": 4.398148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3177000.0,
|
|
"reward": 0.20600494742393494,
|
|
"reward_std": 0.07367828488349915,
|
|
"rewards/grpo_reward_func/mean": 0.20600494742393494,
|
|
"rewards/grpo_reward_func/std": 0.15181554853916168,
|
|
"step": 126
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 1.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.8125,
|
|
"kl": 0.0007094539323588833,
|
|
"learning_rate": 4.3888888888888884e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3202096.0,
|
|
"reward": 0.3842203915119171,
|
|
"reward_std": 0.09158715605735779,
|
|
"rewards/grpo_reward_func/mean": 0.3842203915119171,
|
|
"rewards/grpo_reward_func/std": 0.18436087667942047,
|
|
"step": 127
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.0625,
|
|
"kl": 0.0004973138275090605,
|
|
"learning_rate": 4.3796296296296294e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3227632.0,
|
|
"reward": 0.14040973782539368,
|
|
"reward_std": 0.051458939909935,
|
|
"rewards/grpo_reward_func/mean": 0.14040973782539368,
|
|
"rewards/grpo_reward_func/std": 0.05102093145251274,
|
|
"step": 128
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.875,
|
|
"kl": 0.0005900245305383578,
|
|
"learning_rate": 4.3703703703703704e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3252728.0,
|
|
"reward": 0.23317044973373413,
|
|
"reward_std": 0.12486948072910309,
|
|
"rewards/grpo_reward_func/mean": 0.23317044973373413,
|
|
"rewards/grpo_reward_func/std": 0.18154384195804596,
|
|
"step": 129
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.4375,
|
|
"kl": 0.0003436406550463289,
|
|
"learning_rate": 4.361111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3277808.0,
|
|
"reward": 0.2726486921310425,
|
|
"reward_std": 0.10262490808963776,
|
|
"rewards/grpo_reward_func/mean": 0.2726486921310425,
|
|
"rewards/grpo_reward_func/std": 0.1630534678697586,
|
|
"step": 130
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.1875,
|
|
"kl": 0.0006743809208273888,
|
|
"learning_rate": 4.3518518518518514e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3302816.0,
|
|
"reward": 0.36714380979537964,
|
|
"reward_std": 0.05760783702135086,
|
|
"rewards/grpo_reward_func/mean": 0.36714380979537964,
|
|
"rewards/grpo_reward_func/std": 0.06256558746099472,
|
|
"step": 131
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.125,
|
|
"kl": 0.00044845137745141983,
|
|
"learning_rate": 4.342592592592592e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3327936.0,
|
|
"reward": 0.33067017793655396,
|
|
"reward_std": 0.08341982960700989,
|
|
"rewards/grpo_reward_func/mean": 0.33067017793655396,
|
|
"rewards/grpo_reward_func/std": 0.09450868517160416,
|
|
"step": 132
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5625,
|
|
"kl": 0.0001626002267585136,
|
|
"learning_rate": 4.3333333333333335e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3353384.0,
|
|
"reward": 0.1081024780869484,
|
|
"reward_std": 0.04969579726457596,
|
|
"rewards/grpo_reward_func/mean": 0.1081024780869484,
|
|
"rewards/grpo_reward_func/std": 0.07089214026927948,
|
|
"step": 133
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.9375,
|
|
"kl": 0.0006542075570905581,
|
|
"learning_rate": 4.324074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3378104.0,
|
|
"reward": 0.5095803737640381,
|
|
"reward_std": 0.07591477781534195,
|
|
"rewards/grpo_reward_func/mean": 0.5095803737640381,
|
|
"rewards/grpo_reward_func/std": 0.15140148997306824,
|
|
"step": 134
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.00048515634262003005,
|
|
"learning_rate": 4.3148148148148145e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3403664.0,
|
|
"reward": 0.28993886709213257,
|
|
"reward_std": 0.11650910973548889,
|
|
"rewards/grpo_reward_func/mean": 0.28993886709213257,
|
|
"rewards/grpo_reward_func/std": 0.2471858263015747,
|
|
"step": 135
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.9375,
|
|
"kl": 0.000738232396543026,
|
|
"learning_rate": 4.3055555555555555e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3428648.0,
|
|
"reward": 0.29969072341918945,
|
|
"reward_std": 0.077365942299366,
|
|
"rewards/grpo_reward_func/mean": 0.29969072341918945,
|
|
"rewards/grpo_reward_func/std": 0.07797099649906158,
|
|
"step": 136
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.8125,
|
|
"kl": 0.0003166009337292053,
|
|
"learning_rate": 4.296296296296296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3453600.0,
|
|
"reward": 0.5390628576278687,
|
|
"reward_std": 0.09630399942398071,
|
|
"rewards/grpo_reward_func/mean": 0.5390628576278687,
|
|
"rewards/grpo_reward_func/std": 0.10668856650590897,
|
|
"step": 137
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.125,
|
|
"kl": 0.00017619847494643182,
|
|
"learning_rate": 4.287037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3479008.0,
|
|
"reward": 0.08932866156101227,
|
|
"reward_std": 0.11037556082010269,
|
|
"rewards/grpo_reward_func/mean": 0.08932866156101227,
|
|
"rewards/grpo_reward_func/std": 0.13285614550113678,
|
|
"step": 138
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.0625,
|
|
"kl": 0.0005297964962664992,
|
|
"learning_rate": 4.2777777777777775e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3503672.0,
|
|
"reward": 0.410762220621109,
|
|
"reward_std": 0.11265414208173752,
|
|
"rewards/grpo_reward_func/mean": 0.410762220621109,
|
|
"rewards/grpo_reward_func/std": 0.11756953597068787,
|
|
"step": 139
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.8125,
|
|
"kl": 0.0002442055119900033,
|
|
"learning_rate": 4.2685185185185186e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3528696.0,
|
|
"reward": 0.35452505946159363,
|
|
"reward_std": 0.08229420334100723,
|
|
"rewards/grpo_reward_func/mean": 0.35452505946159363,
|
|
"rewards/grpo_reward_func/std": 0.10353206098079681,
|
|
"step": 140
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.5,
|
|
"kl": 0.00030247091854107566,
|
|
"learning_rate": 4.259259259259259e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3554200.0,
|
|
"reward": 0.10482534766197205,
|
|
"reward_std": 0.16839897632598877,
|
|
"rewards/grpo_reward_func/mean": 0.10482534766197205,
|
|
"rewards/grpo_reward_func/std": 0.1643439382314682,
|
|
"step": 141
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.9375,
|
|
"kl": 0.0009008496999740601,
|
|
"learning_rate": 4.2499999999999995e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3579256.0,
|
|
"reward": 0.344069242477417,
|
|
"reward_std": 0.12131404876708984,
|
|
"rewards/grpo_reward_func/mean": 0.344069242477417,
|
|
"rewards/grpo_reward_func/std": 0.18182261288166046,
|
|
"step": 142
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.1875,
|
|
"kl": 0.0005545982421608642,
|
|
"learning_rate": 4.24074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3605048.0,
|
|
"reward": 0.1483575701713562,
|
|
"reward_std": 0.05153876543045044,
|
|
"rewards/grpo_reward_func/mean": 0.1483575701713562,
|
|
"rewards/grpo_reward_func/std": 0.09833282232284546,
|
|
"step": 143
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.125,
|
|
"kl": 0.0008367840491700917,
|
|
"learning_rate": 4.2314814814814816e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3630656.0,
|
|
"reward": 0.2435261607170105,
|
|
"reward_std": 0.10582385957241058,
|
|
"rewards/grpo_reward_func/mean": 0.2435261607170105,
|
|
"rewards/grpo_reward_func/std": 0.2306629717350006,
|
|
"step": 144
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.75,
|
|
"kl": 0.0006366781890392303,
|
|
"learning_rate": 4.222222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3655736.0,
|
|
"reward": 0.25244393944740295,
|
|
"reward_std": 0.13968312740325928,
|
|
"rewards/grpo_reward_func/mean": 0.25244393944740295,
|
|
"rewards/grpo_reward_func/std": 0.20445255935192108,
|
|
"step": 145
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.9375,
|
|
"kl": 0.0004489117636694573,
|
|
"learning_rate": 4.2129629629629626e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3681032.0,
|
|
"reward": 0.2341691255569458,
|
|
"reward_std": 0.08564946055412292,
|
|
"rewards/grpo_reward_func/mean": 0.2341691255569458,
|
|
"rewards/grpo_reward_func/std": 0.14472095668315887,
|
|
"step": 146
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.0003370772956259316,
|
|
"learning_rate": 4.2037037037037036e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3705952.0,
|
|
"reward": 0.30148059129714966,
|
|
"reward_std": 0.07144688069820404,
|
|
"rewards/grpo_reward_func/mean": 0.30148059129714966,
|
|
"rewards/grpo_reward_func/std": 0.18274889886379242,
|
|
"step": 147
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.25,
|
|
"kl": 0.0002702403216972016,
|
|
"learning_rate": 4.194444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3731040.0,
|
|
"reward": 0.36687034368515015,
|
|
"reward_std": 0.13937704265117645,
|
|
"rewards/grpo_reward_func/mean": 0.36687034368515015,
|
|
"rewards/grpo_reward_func/std": 0.14671309292316437,
|
|
"step": 148
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.1875,
|
|
"kl": 0.0001179131959361257,
|
|
"learning_rate": 4.185185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3755968.0,
|
|
"reward": 0.35363078117370605,
|
|
"reward_std": 0.0943349301815033,
|
|
"rewards/grpo_reward_func/mean": 0.35363078117370605,
|
|
"rewards/grpo_reward_func/std": 0.10976386070251465,
|
|
"step": 149
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.5,
|
|
"kl": 0.0003840612989733927,
|
|
"learning_rate": 4.1759259259259256e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3781184.0,
|
|
"reward": 0.2982335686683655,
|
|
"reward_std": 0.09461906552314758,
|
|
"rewards/grpo_reward_func/mean": 0.2982335686683655,
|
|
"rewards/grpo_reward_func/std": 0.15051327645778656,
|
|
"step": 150
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0625,
|
|
"kl": 0.0006028078350936994,
|
|
"learning_rate": 4.1666666666666667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3806432.0,
|
|
"reward": 0.22157108783721924,
|
|
"reward_std": 0.07859226316213608,
|
|
"rewards/grpo_reward_func/mean": 0.22157108783721924,
|
|
"rewards/grpo_reward_func/std": 0.17600607872009277,
|
|
"step": 151
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.4375,
|
|
"kl": 0.00030071971559664235,
|
|
"learning_rate": 4.157407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3831640.0,
|
|
"reward": 0.2901885509490967,
|
|
"reward_std": 0.11274297535419464,
|
|
"rewards/grpo_reward_func/mean": 0.2901885509490967,
|
|
"rewards/grpo_reward_func/std": 0.21348397433757782,
|
|
"step": 152
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.625,
|
|
"kl": 0.0004837081505684182,
|
|
"learning_rate": 4.1481481481481476e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3857408.0,
|
|
"reward": 0.09370775520801544,
|
|
"reward_std": 0.10032463073730469,
|
|
"rewards/grpo_reward_func/mean": 0.09370775520801544,
|
|
"rewards/grpo_reward_func/std": 0.11353815346956253,
|
|
"step": 153
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.1875,
|
|
"kl": 0.0006801660056225955,
|
|
"learning_rate": 4.1388888888888887e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3882816.0,
|
|
"reward": 0.2329578548669815,
|
|
"reward_std": 0.12824667990207672,
|
|
"rewards/grpo_reward_func/mean": 0.2329578548669815,
|
|
"rewards/grpo_reward_func/std": 0.24429479241371155,
|
|
"step": 154
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.125,
|
|
"kl": 0.00012743038450935273,
|
|
"learning_rate": 4.1296296296296297e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3908224.0,
|
|
"reward": 0.22462239861488342,
|
|
"reward_std": 0.06313692033290863,
|
|
"rewards/grpo_reward_func/mean": 0.22462239861488342,
|
|
"rewards/grpo_reward_func/std": 0.15244141221046448,
|
|
"step": 155
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.0007045886595733464,
|
|
"learning_rate": 4.12037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3933728.0,
|
|
"reward": 0.07521216571331024,
|
|
"reward_std": 0.07558181881904602,
|
|
"rewards/grpo_reward_func/mean": 0.07521216571331024,
|
|
"rewards/grpo_reward_func/std": 0.09718126058578491,
|
|
"step": 156
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.375,
|
|
"kl": 0.00023933003103593364,
|
|
"learning_rate": 4.1111111111111107e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3958840.0,
|
|
"reward": 0.2714136838912964,
|
|
"reward_std": 0.13314189016819,
|
|
"rewards/grpo_reward_func/mean": 0.2714136838912964,
|
|
"rewards/grpo_reward_func/std": 0.1861964911222458,
|
|
"step": 157
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.75,
|
|
"kl": 0.00025627949071349576,
|
|
"learning_rate": 4.1018518518518517e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3983984.0,
|
|
"reward": 0.30816277861595154,
|
|
"reward_std": 0.07617802917957306,
|
|
"rewards/grpo_reward_func/mean": 0.30816277861595154,
|
|
"rewards/grpo_reward_func/std": 0.09437035024166107,
|
|
"step": 158
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.9375,
|
|
"kl": 0.0006072036921977997,
|
|
"learning_rate": 4.092592592592593e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4009128.0,
|
|
"reward": 0.3487386405467987,
|
|
"reward_std": 0.10052464157342911,
|
|
"rewards/grpo_reward_func/mean": 0.3487386405467987,
|
|
"rewards/grpo_reward_func/std": 0.09949901700019836,
|
|
"step": 159
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.0005894883797736838,
|
|
"learning_rate": 4.083333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4034272.0,
|
|
"reward": 0.20100285112857819,
|
|
"reward_std": 0.07660327851772308,
|
|
"rewards/grpo_reward_func/mean": 0.20100285112857819,
|
|
"rewards/grpo_reward_func/std": 0.10585327446460724,
|
|
"step": 160
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5,
|
|
"kl": 0.0003513234405545518,
|
|
"learning_rate": 4.0740740740740737e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4060464.0,
|
|
"reward": 0.17975829541683197,
|
|
"reward_std": 0.12321469932794571,
|
|
"rewards/grpo_reward_func/mean": 0.17975829541683197,
|
|
"rewards/grpo_reward_func/std": 0.19033879041671753,
|
|
"step": 161
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.5625,
|
|
"kl": 0.0007510657014790922,
|
|
"learning_rate": 4.064814814814815e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4085416.0,
|
|
"reward": 0.4079420566558838,
|
|
"reward_std": 0.1232195794582367,
|
|
"rewards/grpo_reward_func/mean": 0.4079420566558838,
|
|
"rewards/grpo_reward_func/std": 0.12682023644447327,
|
|
"step": 162
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.8125,
|
|
"kl": 0.0005510623304871842,
|
|
"learning_rate": 4.055555555555555e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4110488.0,
|
|
"reward": 0.25836923718452454,
|
|
"reward_std": 0.08329159766435623,
|
|
"rewards/grpo_reward_func/mean": 0.25836923718452454,
|
|
"rewards/grpo_reward_func/std": 0.08466833829879761,
|
|
"step": 163
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.125,
|
|
"kl": 0.0007114599866326898,
|
|
"learning_rate": 4.046296296296296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4136408.0,
|
|
"reward": 0.12126512080430984,
|
|
"reward_std": 0.08648187667131424,
|
|
"rewards/grpo_reward_func/mean": 0.12126512080430984,
|
|
"rewards/grpo_reward_func/std": 0.10869091749191284,
|
|
"step": 164
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5625,
|
|
"kl": 0.00045018985110800713,
|
|
"learning_rate": 4.0370370370370373e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4161264.0,
|
|
"reward": 0.4755350947380066,
|
|
"reward_std": 0.11967408657073975,
|
|
"rewards/grpo_reward_func/mean": 0.4755350947380066,
|
|
"rewards/grpo_reward_func/std": 0.13084648549556732,
|
|
"step": 165
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5,
|
|
"kl": 0.0006938679289305583,
|
|
"learning_rate": 4.027777777777778e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4186760.0,
|
|
"reward": 0.27864474058151245,
|
|
"reward_std": 0.13842284679412842,
|
|
"rewards/grpo_reward_func/mean": 0.27864474058151245,
|
|
"rewards/grpo_reward_func/std": 0.26182281970977783,
|
|
"step": 166
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.75,
|
|
"kl": 0.00033565983176231384,
|
|
"learning_rate": 4.0185185185185183e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4211760.0,
|
|
"reward": 0.32802748680114746,
|
|
"reward_std": 0.0578024685382843,
|
|
"rewards/grpo_reward_func/mean": 0.32802748680114746,
|
|
"rewards/grpo_reward_func/std": 0.05809301882982254,
|
|
"step": 167
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.875,
|
|
"kl": 0.0002027436494245194,
|
|
"learning_rate": 4.009259259259259e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4236952.0,
|
|
"reward": 0.27089524269104004,
|
|
"reward_std": 0.09902771562337875,
|
|
"rewards/grpo_reward_func/mean": 0.27089524269104004,
|
|
"rewards/grpo_reward_func/std": 0.1164289340376854,
|
|
"step": 168
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.9375,
|
|
"kl": 0.00028807235503336415,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4262032.0,
|
|
"reward": 0.2959464192390442,
|
|
"reward_std": 0.08586762100458145,
|
|
"rewards/grpo_reward_func/mean": 0.2959464192390442,
|
|
"rewards/grpo_reward_func/std": 0.09414460510015488,
|
|
"step": 169
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.9375,
|
|
"kl": 0.0005565596075030044,
|
|
"learning_rate": 3.990740740740741e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4286880.0,
|
|
"reward": 0.42600101232528687,
|
|
"reward_std": 0.13071218132972717,
|
|
"rewards/grpo_reward_func/mean": 0.42600101232528687,
|
|
"rewards/grpo_reward_func/std": 0.1445237398147583,
|
|
"step": 170
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.1875,
|
|
"kl": 0.0007460726046701893,
|
|
"learning_rate": 3.9814814814814813e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4312808.0,
|
|
"reward": 0.10033319890499115,
|
|
"reward_std": 0.06247374042868614,
|
|
"rewards/grpo_reward_func/mean": 0.10033319890499115,
|
|
"rewards/grpo_reward_func/std": 0.06105644628405571,
|
|
"step": 171
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.84375,
|
|
"kl": 0.0002445072532282211,
|
|
"learning_rate": 3.972222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4337864.0,
|
|
"reward": 0.2938867211341858,
|
|
"reward_std": 0.11876720935106277,
|
|
"rewards/grpo_reward_func/mean": 0.2938867211341858,
|
|
"rewards/grpo_reward_func/std": 0.14251013100147247,
|
|
"step": 172
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.8125,
|
|
"kl": 0.0003134909420623444,
|
|
"learning_rate": 3.962962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4363144.0,
|
|
"reward": 0.2597373127937317,
|
|
"reward_std": 0.1320008635520935,
|
|
"rewards/grpo_reward_func/mean": 0.2597373127937317,
|
|
"rewards/grpo_reward_func/std": 0.14323653280735016,
|
|
"step": 173
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5,
|
|
"kl": 0.0004200125113129616,
|
|
"learning_rate": 3.9537037037037034e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4387896.0,
|
|
"reward": 0.3413686752319336,
|
|
"reward_std": 0.11463446915149689,
|
|
"rewards/grpo_reward_func/mean": 0.3413686752319336,
|
|
"rewards/grpo_reward_func/std": 0.14729353785514832,
|
|
"step": 174
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.625,
|
|
"kl": 0.0007713943195994943,
|
|
"learning_rate": 3.9444444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4412888.0,
|
|
"reward": 0.25845998525619507,
|
|
"reward_std": 0.07939323782920837,
|
|
"rewards/grpo_reward_func/mean": 0.25845998525619507,
|
|
"rewards/grpo_reward_func/std": 0.21095220744609833,
|
|
"step": 175
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.25,
|
|
"kl": 0.0008315810118801892,
|
|
"learning_rate": 3.9351851851851854e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4438472.0,
|
|
"reward": 0.2494005262851715,
|
|
"reward_std": 0.12670020759105682,
|
|
"rewards/grpo_reward_func/mean": 0.2494005262851715,
|
|
"rewards/grpo_reward_func/std": 0.24672208726406097,
|
|
"step": 176
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.25,
|
|
"kl": 0.0003384503797860816,
|
|
"learning_rate": 3.925925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4463536.0,
|
|
"reward": 0.3772205710411072,
|
|
"reward_std": 0.13446420431137085,
|
|
"rewards/grpo_reward_func/mean": 0.3772205710411072,
|
|
"rewards/grpo_reward_func/std": 0.13952113687992096,
|
|
"step": 177
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.375,
|
|
"kl": 0.0008698782767169178,
|
|
"learning_rate": 3.9166666666666664e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4489168.0,
|
|
"reward": 0.25278976559638977,
|
|
"reward_std": 0.13060712814331055,
|
|
"rewards/grpo_reward_func/mean": 0.25278976559638977,
|
|
"rewards/grpo_reward_func/std": 0.24315965175628662,
|
|
"step": 178
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5,
|
|
"kl": 0.0002482995987520553,
|
|
"learning_rate": 3.907407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4514576.0,
|
|
"reward": 0.26432496309280396,
|
|
"reward_std": 0.10723777115345001,
|
|
"rewards/grpo_reward_func/mean": 0.26432496309280396,
|
|
"rewards/grpo_reward_func/std": 0.16780295968055725,
|
|
"step": 179
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.75,
|
|
"kl": 0.0007870141416788101,
|
|
"learning_rate": 3.898148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4539328.0,
|
|
"reward": 0.40482616424560547,
|
|
"reward_std": 0.16082629561424255,
|
|
"rewards/grpo_reward_func/mean": 0.40482616424560547,
|
|
"rewards/grpo_reward_func/std": 0.15713298320770264,
|
|
"step": 180
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.125,
|
|
"kl": 0.0006626859976677224,
|
|
"learning_rate": 3.888888888888889e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4564408.0,
|
|
"reward": 0.35017871856689453,
|
|
"reward_std": 0.11673957854509354,
|
|
"rewards/grpo_reward_func/mean": 0.35017871856689453,
|
|
"rewards/grpo_reward_func/std": 0.12969790399074554,
|
|
"step": 181
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 19.625,
|
|
"kl": 0.000518678076332435,
|
|
"learning_rate": 3.8796296296296294e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4589304.0,
|
|
"reward": 0.29897165298461914,
|
|
"reward_std": 0.1672011762857437,
|
|
"rewards/grpo_reward_func/mean": 0.29897165298461914,
|
|
"rewards/grpo_reward_func/std": 0.17964954674243927,
|
|
"step": 182
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.1875,
|
|
"kl": 0.0003269196895416826,
|
|
"learning_rate": 3.87037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4614184.0,
|
|
"reward": 0.3464832901954651,
|
|
"reward_std": 0.09929412603378296,
|
|
"rewards/grpo_reward_func/mean": 0.3464832901954651,
|
|
"rewards/grpo_reward_func/std": 0.1351390779018402,
|
|
"step": 183
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.6875,
|
|
"kl": 0.0004685633030021563,
|
|
"learning_rate": 3.861111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4640008.0,
|
|
"reward": 0.1745709925889969,
|
|
"reward_std": 0.17333576083183289,
|
|
"rewards/grpo_reward_func/mean": 0.1745709925889969,
|
|
"rewards/grpo_reward_func/std": 0.1874336302280426,
|
|
"step": 184
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.125,
|
|
"kl": 0.00043425335024949163,
|
|
"learning_rate": 3.8518518518518515e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4665312.0,
|
|
"reward": 0.28968238830566406,
|
|
"reward_std": 0.16824908554553986,
|
|
"rewards/grpo_reward_func/mean": 0.28968238830566406,
|
|
"rewards/grpo_reward_func/std": 0.2248057723045349,
|
|
"step": 185
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0625,
|
|
"kl": 0.0006378417165251449,
|
|
"learning_rate": 3.8425925925925925e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4690392.0,
|
|
"reward": 0.397042453289032,
|
|
"reward_std": 0.17139402031898499,
|
|
"rewards/grpo_reward_func/mean": 0.397042453289032,
|
|
"rewards/grpo_reward_func/std": 0.2513841390609741,
|
|
"step": 186
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.0007361011957982555,
|
|
"learning_rate": 3.8333333333333335e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4714992.0,
|
|
"reward": 0.29339537024497986,
|
|
"reward_std": 0.09461888670921326,
|
|
"rewards/grpo_reward_func/mean": 0.29339537024497986,
|
|
"rewards/grpo_reward_func/std": 0.11325549334287643,
|
|
"step": 187
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 24.375,
|
|
"kl": 0.0004989129301975481,
|
|
"learning_rate": 3.824074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4739696.0,
|
|
"reward": 0.38193440437316895,
|
|
"reward_std": 0.15044079720973969,
|
|
"rewards/grpo_reward_func/mean": 0.38193440437316895,
|
|
"rewards/grpo_reward_func/std": 0.1526176482439041,
|
|
"step": 188
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.875,
|
|
"kl": 0.0008048738091019914,
|
|
"learning_rate": 3.8148148148148145e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4765432.0,
|
|
"reward": 0.10808064788579941,
|
|
"reward_std": 0.14257347583770752,
|
|
"rewards/grpo_reward_func/mean": 0.10808064788579941,
|
|
"rewards/grpo_reward_func/std": 0.1668616235256195,
|
|
"step": 189
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.125,
|
|
"kl": 0.0006854971870779991,
|
|
"learning_rate": 3.805555555555555e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4790728.0,
|
|
"reward": 0.3299209475517273,
|
|
"reward_std": 0.12696070969104767,
|
|
"rewards/grpo_reward_func/mean": 0.3299209475517273,
|
|
"rewards/grpo_reward_func/std": 0.14549556374549866,
|
|
"step": 190
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 2.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.4375,
|
|
"kl": 0.0005391652957769111,
|
|
"learning_rate": 3.7962962962962966e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4815896.0,
|
|
"reward": 0.24173803627490997,
|
|
"reward_std": 0.13923460245132446,
|
|
"rewards/grpo_reward_func/mean": 0.24173803627490997,
|
|
"rewards/grpo_reward_func/std": 0.13477925956249237,
|
|
"step": 191
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.375,
|
|
"kl": 0.0010796443675644696,
|
|
"learning_rate": 3.787037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4841448.0,
|
|
"reward": 0.1647602617740631,
|
|
"reward_std": 0.14355677366256714,
|
|
"rewards/grpo_reward_func/mean": 0.1647602617740631,
|
|
"rewards/grpo_reward_func/std": 0.15943202376365662,
|
|
"step": 192
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5,
|
|
"kl": 0.00045456798397935927,
|
|
"learning_rate": 3.7777777777777775e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4866200.0,
|
|
"reward": 0.34182441234588623,
|
|
"reward_std": 0.1141589879989624,
|
|
"rewards/grpo_reward_func/mean": 0.34182441234588623,
|
|
"rewards/grpo_reward_func/std": 0.13174206018447876,
|
|
"step": 193
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.59375,
|
|
"kl": 0.0007237400859594345,
|
|
"learning_rate": 3.768518518518518e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4891192.0,
|
|
"reward": 0.3446623384952545,
|
|
"reward_std": 0.08384630084037781,
|
|
"rewards/grpo_reward_func/mean": 0.3446623384952545,
|
|
"rewards/grpo_reward_func/std": 0.08680541068315506,
|
|
"step": 194
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.6875,
|
|
"kl": 0.000518413566169329,
|
|
"learning_rate": 3.759259259259259e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4916504.0,
|
|
"reward": 0.2807949185371399,
|
|
"reward_std": 0.10580653697252274,
|
|
"rewards/grpo_reward_func/mean": 0.2807949185371399,
|
|
"rewards/grpo_reward_func/std": 0.19993965327739716,
|
|
"step": 195
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.4375,
|
|
"kl": 0.000430591702752281,
|
|
"learning_rate": 3.75e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4942040.0,
|
|
"reward": 0.17394839227199554,
|
|
"reward_std": 0.06569032371044159,
|
|
"rewards/grpo_reward_func/mean": 0.17394839227199554,
|
|
"rewards/grpo_reward_func/std": 0.19397369027137756,
|
|
"step": 196
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.75,
|
|
"kl": 0.00039455325168091804,
|
|
"learning_rate": 3.7407407407407406e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4967200.0,
|
|
"reward": 0.4116261601448059,
|
|
"reward_std": 0.18030327558517456,
|
|
"rewards/grpo_reward_func/mean": 0.4116261601448059,
|
|
"rewards/grpo_reward_func/std": 0.2310413271188736,
|
|
"step": 197
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.0005141782166901976,
|
|
"learning_rate": 3.7314814814814816e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4992016.0,
|
|
"reward": 0.46965691447257996,
|
|
"reward_std": 0.1314663141965866,
|
|
"rewards/grpo_reward_func/mean": 0.46965691447257996,
|
|
"rewards/grpo_reward_func/std": 0.17656759917736053,
|
|
"step": 198
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.4375,
|
|
"kl": 0.0005590068249148317,
|
|
"learning_rate": 3.722222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5016520.0,
|
|
"reward": 0.4341745972633362,
|
|
"reward_std": 0.10189318656921387,
|
|
"rewards/grpo_reward_func/mean": 0.4341745972633362,
|
|
"rewards/grpo_reward_func/std": 0.21076862514019012,
|
|
"step": 199
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.25,
|
|
"kl": 0.00043097294110339135,
|
|
"learning_rate": 3.7129629629629626e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5041600.0,
|
|
"reward": 0.19751757383346558,
|
|
"reward_std": 0.10619483888149261,
|
|
"rewards/grpo_reward_func/mean": 0.19751757383346558,
|
|
"rewards/grpo_reward_func/std": 0.19493362307548523,
|
|
"step": 200
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.0,
|
|
"kl": 0.0008377966587431729,
|
|
"learning_rate": 3.703703703703703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5066544.0,
|
|
"reward": 0.2634657025337219,
|
|
"reward_std": 0.09939266741275787,
|
|
"rewards/grpo_reward_func/mean": 0.2634657025337219,
|
|
"rewards/grpo_reward_func/std": 0.11164335906505585,
|
|
"step": 201
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.8125,
|
|
"kl": 0.0007731840014457703,
|
|
"learning_rate": 3.6944444444444447e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5092384.0,
|
|
"reward": 0.09646777808666229,
|
|
"reward_std": 0.06861913204193115,
|
|
"rewards/grpo_reward_func/mean": 0.09646777808666229,
|
|
"rewards/grpo_reward_func/std": 0.07161495089530945,
|
|
"step": 202
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.625,
|
|
"kl": 0.0002781056537060067,
|
|
"learning_rate": 3.685185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5117632.0,
|
|
"reward": 0.3059152364730835,
|
|
"reward_std": 0.15640440583229065,
|
|
"rewards/grpo_reward_func/mean": 0.3059152364730835,
|
|
"rewards/grpo_reward_func/std": 0.25642770528793335,
|
|
"step": 203
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.1875,
|
|
"kl": 0.0006508687511086464,
|
|
"learning_rate": 3.6759259259259257e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5142432.0,
|
|
"reward": 0.4996418356895447,
|
|
"reward_std": 0.13480040431022644,
|
|
"rewards/grpo_reward_func/mean": 0.4996418356895447,
|
|
"rewards/grpo_reward_func/std": 0.14277315139770508,
|
|
"step": 204
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.6875,
|
|
"kl": 0.000795925036072731,
|
|
"learning_rate": 3.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5167304.0,
|
|
"reward": 0.42619913816452026,
|
|
"reward_std": 0.1849099099636078,
|
|
"rewards/grpo_reward_func/mean": 0.42619913816452026,
|
|
"rewards/grpo_reward_func/std": 0.18658678233623505,
|
|
"step": 205
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.0625,
|
|
"kl": 0.0011986760946456343,
|
|
"learning_rate": 3.657407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5192104.0,
|
|
"reward": 0.4016791880130768,
|
|
"reward_std": 0.07631438970565796,
|
|
"rewards/grpo_reward_func/mean": 0.4016791880130768,
|
|
"rewards/grpo_reward_func/std": 0.14636240899562836,
|
|
"step": 206
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.125,
|
|
"kl": 0.0007918803166830912,
|
|
"learning_rate": 3.648148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5217048.0,
|
|
"reward": 0.3634570837020874,
|
|
"reward_std": 0.13550926744937897,
|
|
"rewards/grpo_reward_func/mean": 0.3634570837020874,
|
|
"rewards/grpo_reward_func/std": 0.1402992159128189,
|
|
"step": 207
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5625,
|
|
"kl": 0.0005829473811900243,
|
|
"learning_rate": 3.6388888888888887e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5242296.0,
|
|
"reward": 0.30222201347351074,
|
|
"reward_std": 0.14429670572280884,
|
|
"rewards/grpo_reward_func/mean": 0.30222201347351074,
|
|
"rewards/grpo_reward_func/std": 0.15859085321426392,
|
|
"step": 208
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.40625,
|
|
"kl": 0.00040928709495346993,
|
|
"learning_rate": 3.6296296296296297e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5267656.0,
|
|
"reward": 0.2286316603422165,
|
|
"reward_std": 0.09120648354291916,
|
|
"rewards/grpo_reward_func/mean": 0.2286316603422165,
|
|
"rewards/grpo_reward_func/std": 0.21030573546886444,
|
|
"step": 209
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.8125,
|
|
"kl": 0.0006142336205812171,
|
|
"learning_rate": 3.62037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5293176.0,
|
|
"reward": 0.14809495210647583,
|
|
"reward_std": 0.14708967506885529,
|
|
"rewards/grpo_reward_func/mean": 0.14809495210647583,
|
|
"rewards/grpo_reward_func/std": 0.16517038643360138,
|
|
"step": 210
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.8125,
|
|
"kl": 0.0008529710030416027,
|
|
"learning_rate": 3.6111111111111107e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5318936.0,
|
|
"reward": 0.17756229639053345,
|
|
"reward_std": 0.058169350028038025,
|
|
"rewards/grpo_reward_func/mean": 0.17756229639053345,
|
|
"rewards/grpo_reward_func/std": 0.13397441804409027,
|
|
"step": 211
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.1875,
|
|
"kl": 0.0003407594340387732,
|
|
"learning_rate": 3.601851851851852e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5344008.0,
|
|
"reward": 0.36280357837677,
|
|
"reward_std": 0.09298541396856308,
|
|
"rewards/grpo_reward_func/mean": 0.36280357837677,
|
|
"rewards/grpo_reward_func/std": 0.09538479149341583,
|
|
"step": 212
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.0005417931824922562,
|
|
"learning_rate": 3.592592592592593e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5368960.0,
|
|
"reward": 0.47765880823135376,
|
|
"reward_std": 0.1049264445900917,
|
|
"rewards/grpo_reward_func/mean": 0.47765880823135376,
|
|
"rewards/grpo_reward_func/std": 0.12036207318305969,
|
|
"step": 213
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 11.9375,
|
|
"completions/mean_terminated_length": 11.9375,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 3.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.0,
|
|
"kl": 0.0007974399486556649,
|
|
"learning_rate": 3.583333333333333e-07,
|
|
"loss": -0.005,
|
|
"num_tokens": 5394727.0,
|
|
"reward": 0.16735509037971497,
|
|
"reward_std": 0.0997590720653534,
|
|
"rewards/grpo_reward_func/mean": 0.16735509037971497,
|
|
"rewards/grpo_reward_func/std": 0.12222032994031906,
|
|
"step": 214
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.5,
|
|
"kl": 0.0013814661651849747,
|
|
"learning_rate": 3.574074074074074e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 5419783.0,
|
|
"reward": 0.3473682999610901,
|
|
"reward_std": 0.08365271985530853,
|
|
"rewards/grpo_reward_func/mean": 0.3473682999610901,
|
|
"rewards/grpo_reward_func/std": 0.10378436744213104,
|
|
"step": 215
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.875,
|
|
"kl": 0.000576346181333065,
|
|
"learning_rate": 3.564814814814814e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5445191.0,
|
|
"reward": 0.3035712242126465,
|
|
"reward_std": 0.1296510398387909,
|
|
"rewards/grpo_reward_func/mean": 0.3035712242126465,
|
|
"rewards/grpo_reward_func/std": 0.2325069159269333,
|
|
"step": 216
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.4375,
|
|
"kl": 0.0003792364223045297,
|
|
"learning_rate": 3.5555555555555553e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5471263.0,
|
|
"reward": 0.08952207118272781,
|
|
"reward_std": 0.060667045414447784,
|
|
"rewards/grpo_reward_func/mean": 0.08952207118272781,
|
|
"rewards/grpo_reward_func/std": 0.061015550047159195,
|
|
"step": 217
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.00032404749072156847,
|
|
"learning_rate": 3.5462962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5496703.0,
|
|
"reward": 0.0992613434791565,
|
|
"reward_std": 0.07703530788421631,
|
|
"rewards/grpo_reward_func/mean": 0.0992613434791565,
|
|
"rewards/grpo_reward_func/std": 0.145080104470253,
|
|
"step": 218
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.3125,
|
|
"kl": 0.0006244319229153916,
|
|
"learning_rate": 3.537037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5521919.0,
|
|
"reward": 0.187003493309021,
|
|
"reward_std": 0.11275693774223328,
|
|
"rewards/grpo_reward_func/mean": 0.187003493309021,
|
|
"rewards/grpo_reward_func/std": 0.1791585236787796,
|
|
"step": 219
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.6875,
|
|
"kl": 0.0013147607969585806,
|
|
"learning_rate": 3.527777777777778e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 5547479.0,
|
|
"reward": 0.23747900128364563,
|
|
"reward_std": 0.1416703164577484,
|
|
"rewards/grpo_reward_func/mean": 0.23747900128364563,
|
|
"rewards/grpo_reward_func/std": 0.26003557443618774,
|
|
"step": 220
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.75,
|
|
"kl": 0.001335889071924612,
|
|
"learning_rate": 3.5185185185185183e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 5573023.0,
|
|
"reward": 0.3594636619091034,
|
|
"reward_std": 0.10562098026275635,
|
|
"rewards/grpo_reward_func/mean": 0.3594636619091034,
|
|
"rewards/grpo_reward_func/std": 0.2661304175853729,
|
|
"step": 221
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.25,
|
|
"kl": 0.0004436932358657941,
|
|
"learning_rate": 3.509259259259259e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5598591.0,
|
|
"reward": 0.15536442399024963,
|
|
"reward_std": 0.09705634415149689,
|
|
"rewards/grpo_reward_func/mean": 0.15536442399024963,
|
|
"rewards/grpo_reward_func/std": 0.177720844745636,
|
|
"step": 222
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.28125,
|
|
"kl": 0.00038727434002794325,
|
|
"learning_rate": 3.5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5623975.0,
|
|
"reward": 0.2250569462776184,
|
|
"reward_std": 0.043075259774923325,
|
|
"rewards/grpo_reward_func/mean": 0.2250569462776184,
|
|
"rewards/grpo_reward_func/std": 0.15976740419864655,
|
|
"step": 223
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0,
|
|
"kl": 0.0007129740115487948,
|
|
"learning_rate": 3.490740740740741e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5649511.0,
|
|
"reward": 0.22658474743366241,
|
|
"reward_std": 0.07546912878751755,
|
|
"rewards/grpo_reward_func/mean": 0.22658474743366241,
|
|
"rewards/grpo_reward_func/std": 0.18879064917564392,
|
|
"step": 224
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.9375,
|
|
"kl": 0.00010857979577849619,
|
|
"learning_rate": 3.4814814814814814e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5674447.0,
|
|
"reward": 0.33255600929260254,
|
|
"reward_std": 0.15443569421768188,
|
|
"rewards/grpo_reward_func/mean": 0.33255600929260254,
|
|
"rewards/grpo_reward_func/std": 0.1559605747461319,
|
|
"step": 225
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0625,
|
|
"kl": 0.0010417526063974947,
|
|
"learning_rate": 3.472222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5699567.0,
|
|
"reward": 0.30261072516441345,
|
|
"reward_std": 0.06423477828502655,
|
|
"rewards/grpo_reward_func/mean": 0.30261072516441345,
|
|
"rewards/grpo_reward_func/std": 0.12586970627307892,
|
|
"step": 226
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.875,
|
|
"kl": 0.001145510614151135,
|
|
"learning_rate": 3.4629629629629624e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5725159.0,
|
|
"reward": 0.19406265020370483,
|
|
"reward_std": 0.10304485261440277,
|
|
"rewards/grpo_reward_func/mean": 0.19406265020370483,
|
|
"rewards/grpo_reward_func/std": 0.16005179286003113,
|
|
"step": 227
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.8125,
|
|
"kl": 0.0006747040679329075,
|
|
"learning_rate": 3.453703703703704e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5750279.0,
|
|
"reward": 0.2185659408569336,
|
|
"reward_std": 0.08110688626766205,
|
|
"rewards/grpo_reward_func/mean": 0.2185659408569336,
|
|
"rewards/grpo_reward_func/std": 0.10881200432777405,
|
|
"step": 228
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.625,
|
|
"kl": 0.00036149504012428224,
|
|
"learning_rate": 3.4444444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5775775.0,
|
|
"reward": 0.252541720867157,
|
|
"reward_std": 0.14369598031044006,
|
|
"rewards/grpo_reward_func/mean": 0.252541720867157,
|
|
"rewards/grpo_reward_func/std": 0.2099451869726181,
|
|
"step": 229
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.625,
|
|
"kl": 0.001367637887597084,
|
|
"learning_rate": 3.435185185185185e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 5801039.0,
|
|
"reward": 0.2519097328186035,
|
|
"reward_std": 0.1605014055967331,
|
|
"rewards/grpo_reward_func/mean": 0.2519097328186035,
|
|
"rewards/grpo_reward_func/std": 0.26890748739242554,
|
|
"step": 230
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.6875,
|
|
"kl": 0.0006766791047994047,
|
|
"learning_rate": 3.425925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5825991.0,
|
|
"reward": 0.3935144543647766,
|
|
"reward_std": 0.12401822954416275,
|
|
"rewards/grpo_reward_func/mean": 0.3935144543647766,
|
|
"rewards/grpo_reward_func/std": 0.1281329244375229,
|
|
"step": 231
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.0010292638908140361,
|
|
"learning_rate": 3.4166666666666664e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5850967.0,
|
|
"reward": 0.3945986032485962,
|
|
"reward_std": 0.0977005809545517,
|
|
"rewards/grpo_reward_func/mean": 0.3945986032485962,
|
|
"rewards/grpo_reward_func/std": 0.146220862865448,
|
|
"step": 232
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.84375,
|
|
"kl": 0.0007942042720969766,
|
|
"learning_rate": 3.407407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5876775.0,
|
|
"reward": 0.1004338338971138,
|
|
"reward_std": 0.12970568239688873,
|
|
"rewards/grpo_reward_func/mean": 0.1004338338971138,
|
|
"rewards/grpo_reward_func/std": 0.1417793482542038,
|
|
"step": 233
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 23.375,
|
|
"kl": 0.0014751525595784187,
|
|
"learning_rate": 3.398148148148148e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 5902023.0,
|
|
"reward": 0.22899229824543,
|
|
"reward_std": 0.10198648273944855,
|
|
"rewards/grpo_reward_func/mean": 0.22899229824543,
|
|
"rewards/grpo_reward_func/std": 0.13079826533794403,
|
|
"step": 234
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.46875,
|
|
"kl": 0.000608055226621218,
|
|
"learning_rate": 3.388888888888889e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5927263.0,
|
|
"reward": 0.29941701889038086,
|
|
"reward_std": 0.06719333678483963,
|
|
"rewards/grpo_reward_func/mean": 0.29941701889038086,
|
|
"rewards/grpo_reward_func/std": 0.14349378645420074,
|
|
"step": 235
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.875,
|
|
"kl": 0.0006101805192884058,
|
|
"learning_rate": 3.3796296296296295e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5951983.0,
|
|
"reward": 0.4177182912826538,
|
|
"reward_std": 0.15579620003700256,
|
|
"rewards/grpo_reward_func/mean": 0.4177182912826538,
|
|
"rewards/grpo_reward_func/std": 0.15117469429969788,
|
|
"step": 236
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.9375,
|
|
"kl": 0.0004981622769264504,
|
|
"learning_rate": 3.37037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5976487.0,
|
|
"reward": 0.3275076150894165,
|
|
"reward_std": 0.16276490688323975,
|
|
"rewards/grpo_reward_func/mean": 0.3275076150894165,
|
|
"rewards/grpo_reward_func/std": 0.1577589213848114,
|
|
"step": 237
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 20.0,
|
|
"kl": 0.0013415751745924354,
|
|
"learning_rate": 3.361111111111111e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 6001479.0,
|
|
"reward": 0.37890833616256714,
|
|
"reward_std": 0.18145695328712463,
|
|
"rewards/grpo_reward_func/mean": 0.37890833616256714,
|
|
"rewards/grpo_reward_func/std": 0.17795169353485107,
|
|
"step": 238
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.0,
|
|
"kl": 0.0011636121198534966,
|
|
"learning_rate": 3.351851851851852e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6027183.0,
|
|
"reward": 0.1332564800977707,
|
|
"reward_std": 0.224045991897583,
|
|
"rewards/grpo_reward_func/mean": 0.1332564800977707,
|
|
"rewards/grpo_reward_func/std": 0.21909579634666443,
|
|
"step": 239
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.8125,
|
|
"kl": 0.0012435338867362589,
|
|
"learning_rate": 3.3425925925925925e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6052111.0,
|
|
"reward": 0.27287042140960693,
|
|
"reward_std": 0.15101364254951477,
|
|
"rewards/grpo_reward_func/mean": 0.27287042140960693,
|
|
"rewards/grpo_reward_func/std": 0.2304336130619049,
|
|
"step": 240
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.25,
|
|
"kl": 0.0005685510259354487,
|
|
"learning_rate": 3.333333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6076831.0,
|
|
"reward": 0.47098228335380554,
|
|
"reward_std": 0.08635647594928741,
|
|
"rewards/grpo_reward_func/mean": 0.47098228335380554,
|
|
"rewards/grpo_reward_func/std": 0.10293111950159073,
|
|
"step": 241
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.78125,
|
|
"kl": 0.0018707392737269402,
|
|
"learning_rate": 3.324074074074074e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 6102039.0,
|
|
"reward": 0.36166447401046753,
|
|
"reward_std": 0.06935366988182068,
|
|
"rewards/grpo_reward_func/mean": 0.36166447401046753,
|
|
"rewards/grpo_reward_func/std": 0.134328693151474,
|
|
"step": 242
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.25,
|
|
"kl": 0.0011540545820025727,
|
|
"learning_rate": 3.3148148148148145e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6126983.0,
|
|
"reward": 0.3171887695789337,
|
|
"reward_std": 0.08502347022294998,
|
|
"rewards/grpo_reward_func/mean": 0.3171887695789337,
|
|
"rewards/grpo_reward_func/std": 0.09260429441928864,
|
|
"step": 243
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.1875,
|
|
"kl": 0.0011697566660586745,
|
|
"learning_rate": 3.3055555555555556e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6152295.0,
|
|
"reward": 0.2625499367713928,
|
|
"reward_std": 0.09874355047941208,
|
|
"rewards/grpo_reward_func/mean": 0.2625499367713928,
|
|
"rewards/grpo_reward_func/std": 0.2084723711013794,
|
|
"step": 244
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.125,
|
|
"kl": 0.0007433524879161268,
|
|
"learning_rate": 3.296296296296296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6177527.0,
|
|
"reward": 0.27035290002822876,
|
|
"reward_std": 0.04757823050022125,
|
|
"rewards/grpo_reward_func/mean": 0.27035290002822876,
|
|
"rewards/grpo_reward_func/std": 0.08030013740062714,
|
|
"step": 245
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.625,
|
|
"kl": 0.0008057684899540618,
|
|
"learning_rate": 3.287037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6202639.0,
|
|
"reward": 0.3587920367717743,
|
|
"reward_std": 0.16274358332157135,
|
|
"rewards/grpo_reward_func/mean": 0.3587920367717743,
|
|
"rewards/grpo_reward_func/std": 0.1588505208492279,
|
|
"step": 246
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.125,
|
|
"kl": 0.0006251692102523521,
|
|
"learning_rate": 3.2777777777777776e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6227983.0,
|
|
"reward": 0.2578817307949066,
|
|
"reward_std": 0.14186282455921173,
|
|
"rewards/grpo_reward_func/mean": 0.2578817307949066,
|
|
"rewards/grpo_reward_func/std": 0.23546750843524933,
|
|
"step": 247
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.6875,
|
|
"kl": 0.000663579732645303,
|
|
"learning_rate": 3.268518518518518e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6254367.0,
|
|
"reward": 0.0007088836282491684,
|
|
"reward_std": 0.09014703333377838,
|
|
"rewards/grpo_reward_func/mean": 0.0007088836282491684,
|
|
"rewards/grpo_reward_func/std": 0.09519969671964645,
|
|
"step": 248
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0,
|
|
"kl": 0.0010352494718972594,
|
|
"learning_rate": 3.2592592592592596e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6279463.0,
|
|
"reward": 0.3615862727165222,
|
|
"reward_std": 0.10009762644767761,
|
|
"rewards/grpo_reward_func/mean": 0.3615862727165222,
|
|
"rewards/grpo_reward_func/std": 0.108461894094944,
|
|
"step": 249
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.6875,
|
|
"kl": 0.0012366212613414973,
|
|
"learning_rate": 3.25e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6304359.0,
|
|
"reward": 0.32284611463546753,
|
|
"reward_std": 0.049088191241025925,
|
|
"rewards/grpo_reward_func/mean": 0.32284611463546753,
|
|
"rewards/grpo_reward_func/std": 0.07815742492675781,
|
|
"step": 250
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.625,
|
|
"kl": 0.0005682266055373475,
|
|
"learning_rate": 3.2407407407407406e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6329879.0,
|
|
"reward": 0.1265973150730133,
|
|
"reward_std": 0.10245135426521301,
|
|
"rewards/grpo_reward_func/mean": 0.1265973150730133,
|
|
"rewards/grpo_reward_func/std": 0.14798611402511597,
|
|
"step": 251
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.34375,
|
|
"kl": 0.0005744351219618693,
|
|
"learning_rate": 3.231481481481481e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6354855.0,
|
|
"reward": 0.3396564722061157,
|
|
"reward_std": 0.05385906249284744,
|
|
"rewards/grpo_reward_func/mean": 0.3396564722061157,
|
|
"rewards/grpo_reward_func/std": 0.052948247641325,
|
|
"step": 252
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.3125,
|
|
"kl": 0.0011231331154704094,
|
|
"learning_rate": 3.222222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6379831.0,
|
|
"reward": 0.3173307776451111,
|
|
"reward_std": 0.07542085647583008,
|
|
"rewards/grpo_reward_func/mean": 0.3173307776451111,
|
|
"rewards/grpo_reward_func/std": 0.10744292289018631,
|
|
"step": 253
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.8125,
|
|
"kl": 0.0005484645516844466,
|
|
"learning_rate": 3.2129629629629626e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6404911.0,
|
|
"reward": 0.3487330675125122,
|
|
"reward_std": 0.1991029679775238,
|
|
"rewards/grpo_reward_func/mean": 0.3487330675125122,
|
|
"rewards/grpo_reward_func/std": 0.23129069805145264,
|
|
"step": 254
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 3.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.75,
|
|
"kl": 0.0005995733808958903,
|
|
"learning_rate": 3.2037037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6429919.0,
|
|
"reward": 0.27099573612213135,
|
|
"reward_std": 0.12892566621303558,
|
|
"rewards/grpo_reward_func/mean": 0.27099573612213135,
|
|
"rewards/grpo_reward_func/std": 0.1356169879436493,
|
|
"step": 255
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0,
|
|
"kl": 0.0009363433055114001,
|
|
"learning_rate": 3.194444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6455263.0,
|
|
"reward": 0.2379104644060135,
|
|
"reward_std": 0.13427025079727173,
|
|
"rewards/grpo_reward_func/mean": 0.2379104644060135,
|
|
"rewards/grpo_reward_func/std": 0.15128843486309052,
|
|
"step": 256
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.8125,
|
|
"kl": 0.0011603829334490001,
|
|
"learning_rate": 3.185185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6480735.0,
|
|
"reward": 0.2011098861694336,
|
|
"reward_std": 0.1447315365076065,
|
|
"rewards/grpo_reward_func/mean": 0.2011098861694336,
|
|
"rewards/grpo_reward_func/std": 0.2299196422100067,
|
|
"step": 257
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.0019025284273084253,
|
|
"learning_rate": 3.1759259259259257e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 6505487.0,
|
|
"reward": 0.38076773285865784,
|
|
"reward_std": 0.062072522938251495,
|
|
"rewards/grpo_reward_func/mean": 0.38076773285865784,
|
|
"rewards/grpo_reward_func/std": 0.060071974992752075,
|
|
"step": 258
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.0004512484447332099,
|
|
"learning_rate": 3.166666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6530567.0,
|
|
"reward": 0.24981309473514557,
|
|
"reward_std": 0.1037866473197937,
|
|
"rewards/grpo_reward_func/mean": 0.24981309473514557,
|
|
"rewards/grpo_reward_func/std": 0.14956361055374146,
|
|
"step": 259
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.625,
|
|
"kl": 0.0010139914229512215,
|
|
"learning_rate": 3.1574074074074077e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6555831.0,
|
|
"reward": 0.21904444694519043,
|
|
"reward_std": 0.07102406024932861,
|
|
"rewards/grpo_reward_func/mean": 0.21904444694519043,
|
|
"rewards/grpo_reward_func/std": 0.17433929443359375,
|
|
"step": 260
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.9375,
|
|
"kl": 0.0008480849792249501,
|
|
"learning_rate": 3.148148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6580823.0,
|
|
"reward": 0.3976954519748688,
|
|
"reward_std": 0.14704205095767975,
|
|
"rewards/grpo_reward_func/mean": 0.3976954519748688,
|
|
"rewards/grpo_reward_func/std": 0.14226453006267548,
|
|
"step": 261
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.4375,
|
|
"kl": 0.0018108648364432156,
|
|
"learning_rate": 3.1388888888888887e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 6605631.0,
|
|
"reward": 0.27959388494491577,
|
|
"reward_std": 0.1226632297039032,
|
|
"rewards/grpo_reward_func/mean": 0.27959388494491577,
|
|
"rewards/grpo_reward_func/std": 0.13256776332855225,
|
|
"step": 262
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.875,
|
|
"kl": 0.0017761494382284582,
|
|
"learning_rate": 3.129629629629629e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 6630615.0,
|
|
"reward": 0.3720259368419647,
|
|
"reward_std": 0.08527237176895142,
|
|
"rewards/grpo_reward_func/mean": 0.3720259368419647,
|
|
"rewards/grpo_reward_func/std": 0.09316051751375198,
|
|
"step": 263
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5625,
|
|
"kl": 0.0005580664874287322,
|
|
"learning_rate": 3.12037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6656335.0,
|
|
"reward": 0.120351143181324,
|
|
"reward_std": 0.06724663823843002,
|
|
"rewards/grpo_reward_func/mean": 0.120351143181324,
|
|
"rewards/grpo_reward_func/std": 0.12132058292627335,
|
|
"step": 264
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.0,
|
|
"kl": 0.0005984306335449219,
|
|
"learning_rate": 3.111111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6681695.0,
|
|
"reward": 0.21656858921051025,
|
|
"reward_std": 0.10678647458553314,
|
|
"rewards/grpo_reward_func/mean": 0.21656858921051025,
|
|
"rewards/grpo_reward_func/std": 0.15582096576690674,
|
|
"step": 265
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.8125,
|
|
"kl": 0.0001807762309908867,
|
|
"learning_rate": 3.101851851851852e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6707239.0,
|
|
"reward": 0.213043212890625,
|
|
"reward_std": 0.09147733449935913,
|
|
"rewards/grpo_reward_func/mean": 0.213043212890625,
|
|
"rewards/grpo_reward_func/std": 0.1813676506280899,
|
|
"step": 266
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.75,
|
|
"kl": 0.0008879285014700145,
|
|
"learning_rate": 3.092592592592592e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6732487.0,
|
|
"reward": 0.222677081823349,
|
|
"reward_std": 0.07209749519824982,
|
|
"rewards/grpo_reward_func/mean": 0.222677081823349,
|
|
"rewards/grpo_reward_func/std": 0.11454568058252335,
|
|
"step": 267
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.4375,
|
|
"kl": 0.0009467930940445513,
|
|
"learning_rate": 3.0833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6758015.0,
|
|
"reward": 0.25332149863243103,
|
|
"reward_std": 0.12080815434455872,
|
|
"rewards/grpo_reward_func/mean": 0.25332149863243103,
|
|
"rewards/grpo_reward_func/std": 0.18324896693229675,
|
|
"step": 268
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.4375,
|
|
"kl": 0.0011847288988064975,
|
|
"learning_rate": 3.074074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6783375.0,
|
|
"reward": 0.1880032867193222,
|
|
"reward_std": 0.07594156265258789,
|
|
"rewards/grpo_reward_func/mean": 0.1880032867193222,
|
|
"rewards/grpo_reward_func/std": 0.14374983310699463,
|
|
"step": 269
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.375,
|
|
"kl": 0.0004900420753983781,
|
|
"learning_rate": 3.0648148148148143e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6808471.0,
|
|
"reward": 0.3218265175819397,
|
|
"reward_std": 0.07450239360332489,
|
|
"rewards/grpo_reward_func/mean": 0.3218265175819397,
|
|
"rewards/grpo_reward_func/std": 0.09696881473064423,
|
|
"step": 270
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.5,
|
|
"kl": 0.0003609297127695754,
|
|
"learning_rate": 3.055555555555556e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6833423.0,
|
|
"reward": 0.4676928222179413,
|
|
"reward_std": 0.11454892158508301,
|
|
"rewards/grpo_reward_func/mean": 0.4676928222179413,
|
|
"rewards/grpo_reward_func/std": 0.12146926671266556,
|
|
"step": 271
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.000912386312847957,
|
|
"learning_rate": 3.0462962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6858519.0,
|
|
"reward": 0.2522871196269989,
|
|
"reward_std": 0.1315116584300995,
|
|
"rewards/grpo_reward_func/mean": 0.2522871196269989,
|
|
"rewards/grpo_reward_func/std": 0.18651296198368073,
|
|
"step": 272
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.4375,
|
|
"kl": 0.0004488583654165268,
|
|
"learning_rate": 3.037037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6883855.0,
|
|
"reward": 0.25175750255584717,
|
|
"reward_std": 0.11373959481716156,
|
|
"rewards/grpo_reward_func/mean": 0.25175750255584717,
|
|
"rewards/grpo_reward_func/std": 0.18083377182483673,
|
|
"step": 273
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.0,
|
|
"kl": 0.0008468221349176019,
|
|
"learning_rate": 3.0277777777777773e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6909191.0,
|
|
"reward": 0.28342726826667786,
|
|
"reward_std": 0.16340333223342896,
|
|
"rewards/grpo_reward_func/mean": 0.28342726826667786,
|
|
"rewards/grpo_reward_func/std": 0.19860301911830902,
|
|
"step": 274
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.25,
|
|
"kl": 0.0004914179589832202,
|
|
"learning_rate": 3.0185185185185183e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6934383.0,
|
|
"reward": 0.3115350604057312,
|
|
"reward_std": 0.13271969556808472,
|
|
"rewards/grpo_reward_func/mean": 0.3115350604057312,
|
|
"rewards/grpo_reward_func/std": 0.14835800230503082,
|
|
"step": 275
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.03125,
|
|
"kl": 0.0005301498022163287,
|
|
"learning_rate": 3.0092592592592594e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6959679.0,
|
|
"reward": 0.22619304060935974,
|
|
"reward_std": 0.052596937865018845,
|
|
"rewards/grpo_reward_func/mean": 0.22619304060935974,
|
|
"rewards/grpo_reward_func/std": 0.06435148417949677,
|
|
"step": 276
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.5,
|
|
"kl": 0.0004184702556813136,
|
|
"learning_rate": 3e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6984911.0,
|
|
"reward": 0.2275838553905487,
|
|
"reward_std": 0.07728221267461777,
|
|
"rewards/grpo_reward_func/mean": 0.2275838553905487,
|
|
"rewards/grpo_reward_func/std": 0.11855830997228622,
|
|
"step": 277
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.9375,
|
|
"kl": 0.0025757864059414715,
|
|
"learning_rate": 2.9907407407407404e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7009823.0,
|
|
"reward": 0.37820249795913696,
|
|
"reward_std": 0.10672685503959656,
|
|
"rewards/grpo_reward_func/mean": 0.37820249795913696,
|
|
"rewards/grpo_reward_func/std": 0.1123933345079422,
|
|
"step": 278
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0625,
|
|
"kl": 0.0011405842669773847,
|
|
"learning_rate": 2.9814814814814814e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7035151.0,
|
|
"reward": 0.0609009750187397,
|
|
"reward_std": 0.05970199033617973,
|
|
"rewards/grpo_reward_func/mean": 0.0609009750187397,
|
|
"rewards/grpo_reward_func/std": 0.0622292160987854,
|
|
"step": 279
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.1875,
|
|
"kl": 0.0007134604675229639,
|
|
"learning_rate": 2.972222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7060359.0,
|
|
"reward": 0.28891313076019287,
|
|
"reward_std": 0.12801918387413025,
|
|
"rewards/grpo_reward_func/mean": 0.28891313076019287,
|
|
"rewards/grpo_reward_func/std": 0.13185609877109528,
|
|
"step": 280
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.25,
|
|
"kl": 0.0006706975400447845,
|
|
"learning_rate": 2.962962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7085407.0,
|
|
"reward": 0.27223989367485046,
|
|
"reward_std": 0.07281184196472168,
|
|
"rewards/grpo_reward_func/mean": 0.27223989367485046,
|
|
"rewards/grpo_reward_func/std": 0.07228488475084305,
|
|
"step": 281
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.6875,
|
|
"kl": 0.0007390764949377626,
|
|
"learning_rate": 2.953703703703704e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7110727.0,
|
|
"reward": 0.3155236542224884,
|
|
"reward_std": 0.08076095581054688,
|
|
"rewards/grpo_reward_func/mean": 0.3155236542224884,
|
|
"rewards/grpo_reward_func/std": 0.1124795451760292,
|
|
"step": 282
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5625,
|
|
"kl": 0.0005147006886545569,
|
|
"learning_rate": 2.9444444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7135567.0,
|
|
"reward": 0.3611481189727783,
|
|
"reward_std": 0.14192907512187958,
|
|
"rewards/grpo_reward_func/mean": 0.3611481189727783,
|
|
"rewards/grpo_reward_func/std": 0.17185887694358826,
|
|
"step": 283
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.375,
|
|
"kl": 0.0007455420272890478,
|
|
"learning_rate": 2.935185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7160631.0,
|
|
"reward": 0.2531818449497223,
|
|
"reward_std": 0.11840977519750595,
|
|
"rewards/grpo_reward_func/mean": 0.2531818449497223,
|
|
"rewards/grpo_reward_func/std": 0.14898955821990967,
|
|
"step": 284
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.9375,
|
|
"kl": 0.0006515182030852884,
|
|
"learning_rate": 2.9259259259259254e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7185359.0,
|
|
"reward": 0.4088771343231201,
|
|
"reward_std": 0.09822411835193634,
|
|
"rewards/grpo_reward_func/mean": 0.4088771343231201,
|
|
"rewards/grpo_reward_func/std": 0.12453342974185944,
|
|
"step": 285
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.625,
|
|
"kl": 0.0017640814476180822,
|
|
"learning_rate": 2.916666666666667e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7211087.0,
|
|
"reward": 0.21529509127140045,
|
|
"reward_std": 0.14051379263401031,
|
|
"rewards/grpo_reward_func/mean": 0.21529509127140045,
|
|
"rewards/grpo_reward_func/std": 0.22462895512580872,
|
|
"step": 286
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.0625,
|
|
"kl": 0.0016883965581655502,
|
|
"learning_rate": 2.9074074074074075e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7236343.0,
|
|
"reward": 0.19085359573364258,
|
|
"reward_std": 0.07424027472734451,
|
|
"rewards/grpo_reward_func/mean": 0.19085359573364258,
|
|
"rewards/grpo_reward_func/std": 0.14520986378192902,
|
|
"step": 287
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.6875,
|
|
"kl": 0.0005011484026908875,
|
|
"learning_rate": 2.898148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7261687.0,
|
|
"reward": 0.23781077563762665,
|
|
"reward_std": 0.1257193684577942,
|
|
"rewards/grpo_reward_func/mean": 0.23781077563762665,
|
|
"rewards/grpo_reward_func/std": 0.2135416567325592,
|
|
"step": 288
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.001285669393837452,
|
|
"learning_rate": 2.8888888888888885e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7286671.0,
|
|
"reward": 0.42376312613487244,
|
|
"reward_std": 0.1314304769039154,
|
|
"rewards/grpo_reward_func/mean": 0.42376312613487244,
|
|
"rewards/grpo_reward_func/std": 0.13118663430213928,
|
|
"step": 289
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.9375,
|
|
"kl": 0.0010273307852912694,
|
|
"learning_rate": 2.8796296296296295e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7312031.0,
|
|
"reward": 0.2674727439880371,
|
|
"reward_std": 0.10928401350975037,
|
|
"rewards/grpo_reward_func/mean": 0.2674727439880371,
|
|
"rewards/grpo_reward_func/std": 0.2467528134584427,
|
|
"step": 290
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.375,
|
|
"kl": 0.0015598470345139503,
|
|
"learning_rate": 2.87037037037037e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7337039.0,
|
|
"reward": 0.355027437210083,
|
|
"reward_std": 0.1009407714009285,
|
|
"rewards/grpo_reward_func/mean": 0.355027437210083,
|
|
"rewards/grpo_reward_func/std": 0.10701252520084381,
|
|
"step": 291
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.8125,
|
|
"kl": 0.0006988458335399628,
|
|
"learning_rate": 2.861111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7362615.0,
|
|
"reward": 0.3158547878265381,
|
|
"reward_std": 0.10899890214204788,
|
|
"rewards/grpo_reward_func/mean": 0.3158547878265381,
|
|
"rewards/grpo_reward_func/std": 0.24725650250911713,
|
|
"step": 292
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0,
|
|
"kl": 0.00035879015194950625,
|
|
"learning_rate": 2.851851851851852e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7387551.0,
|
|
"reward": 0.4256178140640259,
|
|
"reward_std": 0.11733870208263397,
|
|
"rewards/grpo_reward_func/mean": 0.4256178140640259,
|
|
"rewards/grpo_reward_func/std": 0.11450333893299103,
|
|
"step": 293
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.8125,
|
|
"kl": 0.0016969367861747742,
|
|
"learning_rate": 2.8425925925925925e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7412687.0,
|
|
"reward": 0.29177987575531006,
|
|
"reward_std": 0.09220882505178452,
|
|
"rewards/grpo_reward_func/mean": 0.29177987575531006,
|
|
"rewards/grpo_reward_func/std": 0.2229277640581131,
|
|
"step": 294
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.75,
|
|
"kl": 0.0008428776636719704,
|
|
"learning_rate": 2.833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7438223.0,
|
|
"reward": 0.24013623595237732,
|
|
"reward_std": 0.07828111946582794,
|
|
"rewards/grpo_reward_func/mean": 0.24013623595237732,
|
|
"rewards/grpo_reward_func/std": 0.20988282561302185,
|
|
"step": 295
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.8125,
|
|
"kl": 0.0015230309218168259,
|
|
"learning_rate": 2.8240740740740735e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7463023.0,
|
|
"reward": 0.2694811224937439,
|
|
"reward_std": 0.1517883688211441,
|
|
"rewards/grpo_reward_func/mean": 0.2694811224937439,
|
|
"rewards/grpo_reward_func/std": 0.1755567044019699,
|
|
"step": 296
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.5,
|
|
"kl": 0.0009139720350503922,
|
|
"learning_rate": 2.814814814814815e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7488479.0,
|
|
"reward": 0.21519726514816284,
|
|
"reward_std": 0.12513023614883423,
|
|
"rewards/grpo_reward_func/mean": 0.21519726514816284,
|
|
"rewards/grpo_reward_func/std": 0.15375806391239166,
|
|
"step": 297
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.28125,
|
|
"kl": 0.00033611089747864753,
|
|
"learning_rate": 2.8055555555555556e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7513439.0,
|
|
"reward": 0.29541122913360596,
|
|
"reward_std": 0.06453363597393036,
|
|
"rewards/grpo_reward_func/mean": 0.29541122913360596,
|
|
"rewards/grpo_reward_func/std": 0.07813195884227753,
|
|
"step": 298
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.1875,
|
|
"kl": 0.001329958438873291,
|
|
"learning_rate": 2.796296296296296e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7538879.0,
|
|
"reward": 0.19282189011573792,
|
|
"reward_std": 0.15989510715007782,
|
|
"rewards/grpo_reward_func/mean": 0.19282189011573792,
|
|
"rewards/grpo_reward_func/std": 0.1697288304567337,
|
|
"step": 299
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.5,
|
|
"kl": 0.0011378759809304029,
|
|
"learning_rate": 2.787037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7563959.0,
|
|
"reward": 0.4194362163543701,
|
|
"reward_std": 0.1440151333808899,
|
|
"rewards/grpo_reward_func/mean": 0.4194362163543701,
|
|
"rewards/grpo_reward_func/std": 0.23242692649364471,
|
|
"step": 300
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.8125,
|
|
"kl": 0.0009807453607209027,
|
|
"learning_rate": 2.7777777777777776e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7589055.0,
|
|
"reward": 0.13554009795188904,
|
|
"reward_std": 0.06755845993757248,
|
|
"rewards/grpo_reward_func/mean": 0.13554009795188904,
|
|
"rewards/grpo_reward_func/std": 0.18186631798744202,
|
|
"step": 301
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.3125,
|
|
"kl": 0.0010332918318454176,
|
|
"learning_rate": 2.7685185185185186e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7614335.0,
|
|
"reward": 0.24098092317581177,
|
|
"reward_std": 0.0867777168750763,
|
|
"rewards/grpo_reward_func/mean": 0.24098092317581177,
|
|
"rewards/grpo_reward_func/std": 0.19634543359279633,
|
|
"step": 302
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.0010067789407912642,
|
|
"learning_rate": 2.759259259259259e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7639447.0,
|
|
"reward": 0.2790209650993347,
|
|
"reward_std": 0.11601302027702332,
|
|
"rewards/grpo_reward_func/mean": 0.2790209650993347,
|
|
"rewards/grpo_reward_func/std": 0.11709357798099518,
|
|
"step": 303
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.4375,
|
|
"kl": 0.0010244656878057867,
|
|
"learning_rate": 2.75e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7664423.0,
|
|
"reward": 0.400749534368515,
|
|
"reward_std": 0.08898760378360748,
|
|
"rewards/grpo_reward_func/mean": 0.400749534368515,
|
|
"rewards/grpo_reward_func/std": 0.19838882982730865,
|
|
"step": 304
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.625,
|
|
"kl": 0.001470650837291032,
|
|
"learning_rate": 2.7407407407407406e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7689567.0,
|
|
"reward": 0.2512458562850952,
|
|
"reward_std": 0.12142281234264374,
|
|
"rewards/grpo_reward_func/mean": 0.2512458562850952,
|
|
"rewards/grpo_reward_func/std": 0.1289859265089035,
|
|
"step": 305
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.875,
|
|
"kl": 0.0010143139807041734,
|
|
"learning_rate": 2.731481481481481e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7715719.0,
|
|
"reward": 0.07891548424959183,
|
|
"reward_std": 0.07104109227657318,
|
|
"rewards/grpo_reward_func/mean": 0.07891548424959183,
|
|
"rewards/grpo_reward_func/std": 0.07307452708482742,
|
|
"step": 306
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.625,
|
|
"kl": 0.001021136820781976,
|
|
"learning_rate": 2.7222222222222216e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7741599.0,
|
|
"reward": 0.08877018094062805,
|
|
"reward_std": 0.10754196345806122,
|
|
"rewards/grpo_reward_func/mean": 0.08877018094062805,
|
|
"rewards/grpo_reward_func/std": 0.11736486107110977,
|
|
"step": 307
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.875,
|
|
"kl": 0.0023402251536026597,
|
|
"learning_rate": 2.712962962962963e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7766439.0,
|
|
"reward": 0.3745066523551941,
|
|
"reward_std": 0.10392110794782639,
|
|
"rewards/grpo_reward_func/mean": 0.3745066523551941,
|
|
"rewards/grpo_reward_func/std": 0.1060907319188118,
|
|
"step": 308
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.375,
|
|
"kl": 0.0014679792220704257,
|
|
"learning_rate": 2.7037037037037037e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7792415.0,
|
|
"reward": 0.05516662448644638,
|
|
"reward_std": 0.1237877607345581,
|
|
"rewards/grpo_reward_func/mean": 0.05516662448644638,
|
|
"rewards/grpo_reward_func/std": 0.14112551510334015,
|
|
"step": 309
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5,
|
|
"kl": 0.0011287930537946522,
|
|
"learning_rate": 2.694444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7817583.0,
|
|
"reward": 0.2898673713207245,
|
|
"reward_std": 0.14550068974494934,
|
|
"rewards/grpo_reward_func/mean": 0.2898673713207245,
|
|
"rewards/grpo_reward_func/std": 0.16415542364120483,
|
|
"step": 310
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.8125,
|
|
"kl": 0.0006401048449333757,
|
|
"learning_rate": 2.685185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7842599.0,
|
|
"reward": 0.3488427698612213,
|
|
"reward_std": 0.17175181210041046,
|
|
"rewards/grpo_reward_func/mean": 0.3488427698612213,
|
|
"rewards/grpo_reward_func/std": 0.21280032396316528,
|
|
"step": 311
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.9375,
|
|
"kl": 0.0005905106663703918,
|
|
"learning_rate": 2.6759259259259257e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7868007.0,
|
|
"reward": 0.3986209034919739,
|
|
"reward_std": 0.15302179753780365,
|
|
"rewards/grpo_reward_func/mean": 0.3986209034919739,
|
|
"rewards/grpo_reward_func/std": 0.22395354509353638,
|
|
"step": 312
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.0009062414173968136,
|
|
"learning_rate": 2.6666666666666667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7893263.0,
|
|
"reward": 0.224882572889328,
|
|
"reward_std": 0.11343192309141159,
|
|
"rewards/grpo_reward_func/mean": 0.224882572889328,
|
|
"rewards/grpo_reward_func/std": 0.187125563621521,
|
|
"step": 313
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.3125,
|
|
"kl": 0.0009414016676601022,
|
|
"learning_rate": 2.657407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7918671.0,
|
|
"reward": 0.24882347881793976,
|
|
"reward_std": 0.135990172624588,
|
|
"rewards/grpo_reward_func/mean": 0.24882347881793976,
|
|
"rewards/grpo_reward_func/std": 0.24125628173351288,
|
|
"step": 314
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.1875,
|
|
"kl": 0.0013116998597979546,
|
|
"learning_rate": 2.648148148148148e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 7944103.0,
|
|
"reward": 0.2479252815246582,
|
|
"reward_std": 0.14946752786636353,
|
|
"rewards/grpo_reward_func/mean": 0.2479252815246582,
|
|
"rewards/grpo_reward_func/std": 0.2719341218471527,
|
|
"step": 315
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.125,
|
|
"kl": 0.0005858497024746612,
|
|
"learning_rate": 2.638888888888889e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7969271.0,
|
|
"reward": 0.41760867834091187,
|
|
"reward_std": 0.1600833237171173,
|
|
"rewards/grpo_reward_func/mean": 0.41760867834091187,
|
|
"rewards/grpo_reward_func/std": 0.19628752768039703,
|
|
"step": 316
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.875,
|
|
"kl": 0.0008220685122068971,
|
|
"learning_rate": 2.629629629629629e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7993999.0,
|
|
"reward": 0.35043102502822876,
|
|
"reward_std": 0.09396857023239136,
|
|
"rewards/grpo_reward_func/mean": 0.35043102502822876,
|
|
"rewards/grpo_reward_func/std": 0.1028953343629837,
|
|
"step": 317
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.125,
|
|
"kl": 0.0007283634913619608,
|
|
"learning_rate": 2.62037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8019175.0,
|
|
"reward": 0.4264575242996216,
|
|
"reward_std": 0.07613471150398254,
|
|
"rewards/grpo_reward_func/mean": 0.4264575242996216,
|
|
"rewards/grpo_reward_func/std": 0.23587268590927124,
|
|
"step": 318
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 4.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.3125,
|
|
"kl": 0.001260551915038377,
|
|
"learning_rate": 2.6111111111111113e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8044359.0,
|
|
"reward": 0.3039223551750183,
|
|
"reward_std": 0.1887253224849701,
|
|
"rewards/grpo_reward_func/mean": 0.3039223551750183,
|
|
"rewards/grpo_reward_func/std": 0.24560463428497314,
|
|
"step": 319
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0625,
|
|
"kl": 0.0006503397598862648,
|
|
"learning_rate": 2.601851851851852e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8069079.0,
|
|
"reward": 0.47094419598579407,
|
|
"reward_std": 0.1439197063446045,
|
|
"rewards/grpo_reward_func/mean": 0.47094419598579407,
|
|
"rewards/grpo_reward_func/std": 0.1566508263349533,
|
|
"step": 320
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.5,
|
|
"kl": 0.0005181074739084579,
|
|
"learning_rate": 2.5925925925925923e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8094695.0,
|
|
"reward": 0.21605555713176727,
|
|
"reward_std": 0.04700346663594246,
|
|
"rewards/grpo_reward_func/mean": 0.21605555713176727,
|
|
"rewards/grpo_reward_func/std": 0.16210661828517914,
|
|
"step": 321
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.75,
|
|
"kl": 0.000648934073979035,
|
|
"learning_rate": 2.5833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8119727.0,
|
|
"reward": 0.38052335381507874,
|
|
"reward_std": 0.15309563279151917,
|
|
"rewards/grpo_reward_func/mean": 0.38052335381507874,
|
|
"rewards/grpo_reward_func/std": 0.1593683809041977,
|
|
"step": 322
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.6875,
|
|
"kl": 0.0004919447528664023,
|
|
"learning_rate": 2.574074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8144823.0,
|
|
"reward": 0.2545957565307617,
|
|
"reward_std": 0.09698673337697983,
|
|
"rewards/grpo_reward_func/mean": 0.2545957565307617,
|
|
"rewards/grpo_reward_func/std": 0.14691407978534698,
|
|
"step": 323
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0,
|
|
"kl": 0.0005518794059753418,
|
|
"learning_rate": 2.564814814814815e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8170231.0,
|
|
"reward": 0.2633800804615021,
|
|
"reward_std": 0.1403658092021942,
|
|
"rewards/grpo_reward_func/mean": 0.2633800804615021,
|
|
"rewards/grpo_reward_func/std": 0.20393338799476624,
|
|
"step": 324
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.0625,
|
|
"kl": 0.0013933554291725159,
|
|
"learning_rate": 2.5555555555555553e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8195255.0,
|
|
"reward": 0.4053245782852173,
|
|
"reward_std": 0.13726326823234558,
|
|
"rewards/grpo_reward_func/mean": 0.4053245782852173,
|
|
"rewards/grpo_reward_func/std": 0.23320239782333374,
|
|
"step": 325
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 25.625,
|
|
"kl": 0.002304654335603118,
|
|
"learning_rate": 2.5462962962962963e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8220567.0,
|
|
"reward": 0.28461790084838867,
|
|
"reward_std": 0.2090614289045334,
|
|
"rewards/grpo_reward_func/mean": 0.28461790084838867,
|
|
"rewards/grpo_reward_func/std": 0.253989040851593,
|
|
"step": 326
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.5,
|
|
"kl": 0.0005805188266094774,
|
|
"learning_rate": 2.537037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8245655.0,
|
|
"reward": 0.281326562166214,
|
|
"reward_std": 0.10936335474252701,
|
|
"rewards/grpo_reward_func/mean": 0.281326562166214,
|
|
"rewards/grpo_reward_func/std": 0.13177289068698883,
|
|
"step": 327
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.625,
|
|
"kl": 0.0003259473742218688,
|
|
"learning_rate": 2.5277777777777773e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8270591.0,
|
|
"reward": 0.38816237449645996,
|
|
"reward_std": 0.13367968797683716,
|
|
"rewards/grpo_reward_func/mean": 0.38816237449645996,
|
|
"rewards/grpo_reward_func/std": 0.17273396253585815,
|
|
"step": 328
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.375,
|
|
"kl": 0.0018250771681778133,
|
|
"learning_rate": 2.5185185185185184e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8295767.0,
|
|
"reward": 0.36079350113868713,
|
|
"reward_std": 0.11437784135341644,
|
|
"rewards/grpo_reward_func/mean": 0.36079350113868713,
|
|
"rewards/grpo_reward_func/std": 0.21398717164993286,
|
|
"step": 329
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.3125,
|
|
"kl": 0.002108390093781054,
|
|
"learning_rate": 2.5092592592592594e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8320431.0,
|
|
"reward": 0.3810342848300934,
|
|
"reward_std": 0.08449837565422058,
|
|
"rewards/grpo_reward_func/mean": 0.3810342848300934,
|
|
"rewards/grpo_reward_func/std": 0.0880691260099411,
|
|
"step": 330
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.5625,
|
|
"kl": 0.0014629397774115205,
|
|
"learning_rate": 2.5e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8345951.0,
|
|
"reward": 0.2561107873916626,
|
|
"reward_std": 0.10452878475189209,
|
|
"rewards/grpo_reward_func/mean": 0.2561107873916626,
|
|
"rewards/grpo_reward_func/std": 0.13631132245063782,
|
|
"step": 331
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.6875,
|
|
"kl": 0.0007190862525021657,
|
|
"learning_rate": 2.490740740740741e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8371799.0,
|
|
"reward": 0.05727135390043259,
|
|
"reward_std": 0.08457481861114502,
|
|
"rewards/grpo_reward_func/mean": 0.05727135390043259,
|
|
"rewards/grpo_reward_func/std": 0.08320802450180054,
|
|
"step": 332
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.3125,
|
|
"kl": 0.0010623404232319444,
|
|
"learning_rate": 2.4814814814814814e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8396655.0,
|
|
"reward": 0.47699636220932007,
|
|
"reward_std": 0.1573687195777893,
|
|
"rewards/grpo_reward_func/mean": 0.47699636220932007,
|
|
"rewards/grpo_reward_func/std": 0.16918563842773438,
|
|
"step": 333
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.1875,
|
|
"kl": 0.0008317362517118454,
|
|
"learning_rate": 2.4722222222222224e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8422231.0,
|
|
"reward": 0.2612965404987335,
|
|
"reward_std": 0.10427425801753998,
|
|
"rewards/grpo_reward_func/mean": 0.2612965404987335,
|
|
"rewards/grpo_reward_func/std": 0.25590068101882935,
|
|
"step": 334
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0625,
|
|
"kl": 0.0012046831543557346,
|
|
"learning_rate": 2.462962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8447175.0,
|
|
"reward": 0.4008851647377014,
|
|
"reward_std": 0.11680196225643158,
|
|
"rewards/grpo_reward_func/mean": 0.4008851647377014,
|
|
"rewards/grpo_reward_func/std": 0.13369765877723694,
|
|
"step": 335
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.0,
|
|
"kl": 0.0006340428517432883,
|
|
"learning_rate": 2.4537037037037034e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8472255.0,
|
|
"reward": 0.26478323340415955,
|
|
"reward_std": 0.16724863648414612,
|
|
"rewards/grpo_reward_func/mean": 0.26478323340415955,
|
|
"rewards/grpo_reward_func/std": 0.1973247230052948,
|
|
"step": 336
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.75,
|
|
"kl": 0.0011970326595474035,
|
|
"learning_rate": 2.4444444444444445e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8497263.0,
|
|
"reward": 0.374372661113739,
|
|
"reward_std": 0.08080107718706131,
|
|
"rewards/grpo_reward_func/mean": 0.374372661113739,
|
|
"rewards/grpo_reward_func/std": 0.08129201829433441,
|
|
"step": 337
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.75,
|
|
"kl": 0.0009607278334442526,
|
|
"learning_rate": 2.435185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8522511.0,
|
|
"reward": 0.24800701439380646,
|
|
"reward_std": 0.12350637465715408,
|
|
"rewards/grpo_reward_func/mean": 0.24800701439380646,
|
|
"rewards/grpo_reward_func/std": 0.18218368291854858,
|
|
"step": 338
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.9375,
|
|
"kl": 0.0007166365685407072,
|
|
"learning_rate": 2.425925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8547223.0,
|
|
"reward": 0.388899564743042,
|
|
"reward_std": 0.06452769041061401,
|
|
"rewards/grpo_reward_func/mean": 0.388899564743042,
|
|
"rewards/grpo_reward_func/std": 0.08408286422491074,
|
|
"step": 339
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.6875,
|
|
"kl": 0.0006827044708188623,
|
|
"learning_rate": 2.4166666666666665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8572143.0,
|
|
"reward": 0.3447013795375824,
|
|
"reward_std": 0.10072646290063858,
|
|
"rewards/grpo_reward_func/mean": 0.3447013795375824,
|
|
"rewards/grpo_reward_func/std": 0.1320715993642807,
|
|
"step": 340
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.875,
|
|
"kl": 0.0012708511494565755,
|
|
"learning_rate": 2.407407407407407e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8597503.0,
|
|
"reward": 0.33516252040863037,
|
|
"reward_std": 0.2073679268360138,
|
|
"rewards/grpo_reward_func/mean": 0.33516252040863037,
|
|
"rewards/grpo_reward_func/std": 0.23557628691196442,
|
|
"step": 341
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.1875,
|
|
"kl": 0.0013067865220364183,
|
|
"learning_rate": 2.398148148148148e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8623423.0,
|
|
"reward": 0.06080477684736252,
|
|
"reward_std": 0.08706031739711761,
|
|
"rewards/grpo_reward_func/mean": 0.06080477684736252,
|
|
"rewards/grpo_reward_func/std": 0.08990643173456192,
|
|
"step": 342
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.1875,
|
|
"kl": 0.000486970558995381,
|
|
"learning_rate": 2.388888888888889e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8648583.0,
|
|
"reward": 0.4393516182899475,
|
|
"reward_std": 0.13304097950458527,
|
|
"rewards/grpo_reward_func/mean": 0.4393516182899475,
|
|
"rewards/grpo_reward_func/std": 0.13269037008285522,
|
|
"step": 343
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.9375,
|
|
"kl": 0.0003986669034929946,
|
|
"learning_rate": 2.3796296296296295e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8674007.0,
|
|
"reward": 0.1185535416007042,
|
|
"reward_std": 0.04521109163761139,
|
|
"rewards/grpo_reward_func/mean": 0.1185535416007042,
|
|
"rewards/grpo_reward_func/std": 0.10313326120376587,
|
|
"step": 344
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.9375,
|
|
"kl": 0.0008137710246955976,
|
|
"learning_rate": 2.3703703703703703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8699695.0,
|
|
"reward": 0.22900578379631042,
|
|
"reward_std": 0.1365535855293274,
|
|
"rewards/grpo_reward_func/mean": 0.22900578379631042,
|
|
"rewards/grpo_reward_func/std": 0.18792560696601868,
|
|
"step": 345
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.0,
|
|
"kl": 0.0009939819865394384,
|
|
"learning_rate": 2.361111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8724831.0,
|
|
"reward": 0.29041871428489685,
|
|
"reward_std": 0.08149899542331696,
|
|
"rewards/grpo_reward_func/mean": 0.29041871428489685,
|
|
"rewards/grpo_reward_func/std": 0.219549298286438,
|
|
"step": 346
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.0010270678030792624,
|
|
"learning_rate": 2.3518518518518518e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8749951.0,
|
|
"reward": 0.19644547998905182,
|
|
"reward_std": 0.11144804954528809,
|
|
"rewards/grpo_reward_func/mean": 0.19644547998905182,
|
|
"rewards/grpo_reward_func/std": 0.15927433967590332,
|
|
"step": 347
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.4375,
|
|
"kl": 0.0012654773890972137,
|
|
"learning_rate": 2.3425925925925923e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8774775.0,
|
|
"reward": 0.2843724489212036,
|
|
"reward_std": 0.099556565284729,
|
|
"rewards/grpo_reward_func/mean": 0.2843724489212036,
|
|
"rewards/grpo_reward_func/std": 0.10954099893569946,
|
|
"step": 348
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.3125,
|
|
"kl": 0.0010531196894589812,
|
|
"learning_rate": 2.3333333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8799855.0,
|
|
"reward": 0.37109872698783875,
|
|
"reward_std": 0.09316375851631165,
|
|
"rewards/grpo_reward_func/mean": 0.37109872698783875,
|
|
"rewards/grpo_reward_func/std": 0.0997348204255104,
|
|
"step": 349
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.0,
|
|
"kl": 0.001134138583438471,
|
|
"learning_rate": 2.3240740740740738e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8825103.0,
|
|
"reward": 0.2134397327899933,
|
|
"reward_std": 0.09246792644262314,
|
|
"rewards/grpo_reward_func/mean": 0.2134397327899933,
|
|
"rewards/grpo_reward_func/std": 0.15889793634414673,
|
|
"step": 350
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.4375,
|
|
"kl": 0.0015808992902748287,
|
|
"learning_rate": 2.3148148148148148e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 8849727.0,
|
|
"reward": 0.36531519889831543,
|
|
"reward_std": 0.14355136454105377,
|
|
"rewards/grpo_reward_func/mean": 0.36531519889831543,
|
|
"rewards/grpo_reward_func/std": 0.1389627456665039,
|
|
"step": 351
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.1875,
|
|
"kl": 0.0006994760187808424,
|
|
"learning_rate": 2.3055555555555556e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8874639.0,
|
|
"reward": 0.40682148933410645,
|
|
"reward_std": 0.13052034378051758,
|
|
"rewards/grpo_reward_func/mean": 0.40682148933410645,
|
|
"rewards/grpo_reward_func/std": 0.1289222538471222,
|
|
"step": 352
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.0006646249967161566,
|
|
"learning_rate": 2.296296296296296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8899863.0,
|
|
"reward": 0.2974233031272888,
|
|
"reward_std": 0.15527892112731934,
|
|
"rewards/grpo_reward_func/mean": 0.2974233031272888,
|
|
"rewards/grpo_reward_func/std": 0.19951732456684113,
|
|
"step": 353
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.625,
|
|
"kl": 0.0009673306194599718,
|
|
"learning_rate": 2.287037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8925991.0,
|
|
"reward": 0.07634272426366806,
|
|
"reward_std": 0.06661258637905121,
|
|
"rewards/grpo_reward_func/mean": 0.07634272426366806,
|
|
"rewards/grpo_reward_func/std": 0.07981257140636444,
|
|
"step": 354
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.125,
|
|
"kl": 0.0010449464025441557,
|
|
"learning_rate": 2.2777777777777776e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8951031.0,
|
|
"reward": 0.37363359332084656,
|
|
"reward_std": 0.11995186656713486,
|
|
"rewards/grpo_reward_func/mean": 0.37363359332084656,
|
|
"rewards/grpo_reward_func/std": 0.16280175745487213,
|
|
"step": 355
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.1875,
|
|
"kl": 0.0011851430463138968,
|
|
"learning_rate": 2.2685185185185184e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8976327.0,
|
|
"reward": 0.29697471857070923,
|
|
"reward_std": 0.16333544254302979,
|
|
"rewards/grpo_reward_func/mean": 0.29697471857070923,
|
|
"rewards/grpo_reward_func/std": 0.17323781549930573,
|
|
"step": 356
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.25,
|
|
"kl": 0.0011026623542420566,
|
|
"learning_rate": 2.2592592592592591e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9001479.0,
|
|
"reward": 0.3700714707374573,
|
|
"reward_std": 0.11212660372257233,
|
|
"rewards/grpo_reward_func/mean": 0.3700714707374573,
|
|
"rewards/grpo_reward_func/std": 0.17391134798526764,
|
|
"step": 357
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.875,
|
|
"kl": 0.0008961235289461911,
|
|
"learning_rate": 2.25e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9026831.0,
|
|
"reward": 0.30335038900375366,
|
|
"reward_std": 0.16283930838108063,
|
|
"rewards/grpo_reward_func/mean": 0.30335038900375366,
|
|
"rewards/grpo_reward_func/std": 0.21482953429222107,
|
|
"step": 358
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.0625,
|
|
"kl": 0.001187270536320284,
|
|
"learning_rate": 2.2407407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9052111.0,
|
|
"reward": 0.2336607277393341,
|
|
"reward_std": 0.07578499615192413,
|
|
"rewards/grpo_reward_func/mean": 0.2336607277393341,
|
|
"rewards/grpo_reward_func/std": 0.2178594022989273,
|
|
"step": 359
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5625,
|
|
"kl": 0.0016355722327716649,
|
|
"learning_rate": 2.2314814814814814e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9077327.0,
|
|
"reward": 0.24882102012634277,
|
|
"reward_std": 0.10142374038696289,
|
|
"rewards/grpo_reward_func/mean": 0.24882102012634277,
|
|
"rewards/grpo_reward_func/std": 0.14036507904529572,
|
|
"step": 360
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.5625,
|
|
"kl": 0.0011905189749086276,
|
|
"learning_rate": 2.222222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9102855.0,
|
|
"reward": 0.19088464975357056,
|
|
"reward_std": 0.13671398162841797,
|
|
"rewards/grpo_reward_func/mean": 0.19088464975357056,
|
|
"rewards/grpo_reward_func/std": 0.19571226835250854,
|
|
"step": 361
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.875,
|
|
"kl": 0.0014347138931043446,
|
|
"learning_rate": 2.212962962962963e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9127575.0,
|
|
"reward": 0.4702339172363281,
|
|
"reward_std": 0.13507477939128876,
|
|
"rewards/grpo_reward_func/mean": 0.4702339172363281,
|
|
"rewards/grpo_reward_func/std": 0.13804838061332703,
|
|
"step": 362
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.25,
|
|
"kl": 0.0011942110140807927,
|
|
"learning_rate": 2.2037037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9153375.0,
|
|
"reward": 0.08592602610588074,
|
|
"reward_std": 0.05403965711593628,
|
|
"rewards/grpo_reward_func/mean": 0.08592602610588074,
|
|
"rewards/grpo_reward_func/std": 0.05771300941705704,
|
|
"step": 363
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.5625,
|
|
"kl": 0.0017215957632288337,
|
|
"learning_rate": 2.1944444444444442e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9178727.0,
|
|
"reward": 0.3241480886936188,
|
|
"reward_std": 0.09995287656784058,
|
|
"rewards/grpo_reward_func/mean": 0.3241480886936188,
|
|
"rewards/grpo_reward_func/std": 0.270202100276947,
|
|
"step": 364
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.8125,
|
|
"kl": 0.0006930825184099376,
|
|
"learning_rate": 2.1851851851851852e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9203919.0,
|
|
"reward": 0.33298927545547485,
|
|
"reward_std": 0.13145846128463745,
|
|
"rewards/grpo_reward_func/mean": 0.33298927545547485,
|
|
"rewards/grpo_reward_func/std": 0.1428404450416565,
|
|
"step": 365
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.71875,
|
|
"kl": 0.0006192661821842194,
|
|
"learning_rate": 2.1759259259259257e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9229119.0,
|
|
"reward": 0.24505698680877686,
|
|
"reward_std": 0.09642099589109421,
|
|
"rewards/grpo_reward_func/mean": 0.24505698680877686,
|
|
"rewards/grpo_reward_func/std": 0.098020538687706,
|
|
"step": 366
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5,
|
|
"kl": 0.0006466656923294067,
|
|
"learning_rate": 2.1666666666666667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9253975.0,
|
|
"reward": 0.38370344042778015,
|
|
"reward_std": 0.1567784547805786,
|
|
"rewards/grpo_reward_func/mean": 0.38370344042778015,
|
|
"rewards/grpo_reward_func/std": 0.15556758642196655,
|
|
"step": 367
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.4375,
|
|
"kl": 0.0005825981497764587,
|
|
"learning_rate": 2.1574074074074072e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9279695.0,
|
|
"reward": 0.11095234006643295,
|
|
"reward_std": 0.08616747707128525,
|
|
"rewards/grpo_reward_func/mean": 0.11095234006643295,
|
|
"rewards/grpo_reward_func/std": 0.1468985229730606,
|
|
"step": 368
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.375,
|
|
"kl": 0.0006324804126052186,
|
|
"learning_rate": 2.148148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9304551.0,
|
|
"reward": 0.3810734152793884,
|
|
"reward_std": 0.14643797278404236,
|
|
"rewards/grpo_reward_func/mean": 0.3810734152793884,
|
|
"rewards/grpo_reward_func/std": 0.19883529841899872,
|
|
"step": 369
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.1875,
|
|
"kl": 0.0006064654589863494,
|
|
"learning_rate": 2.1388888888888888e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9330063.0,
|
|
"reward": 0.20634137094020844,
|
|
"reward_std": 0.1969866156578064,
|
|
"rewards/grpo_reward_func/mean": 0.20634137094020844,
|
|
"rewards/grpo_reward_func/std": 0.275967001914978,
|
|
"step": 370
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5625,
|
|
"kl": 0.0008783147786743939,
|
|
"learning_rate": 2.1296296296296295e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9355063.0,
|
|
"reward": 0.3551791310310364,
|
|
"reward_std": 0.0490216389298439,
|
|
"rewards/grpo_reward_func/mean": 0.3551791310310364,
|
|
"rewards/grpo_reward_func/std": 0.05635032430291176,
|
|
"step": 371
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.8125,
|
|
"kl": 0.0009447563061257824,
|
|
"learning_rate": 2.12037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9380719.0,
|
|
"reward": 0.13325411081314087,
|
|
"reward_std": 0.11700575053691864,
|
|
"rewards/grpo_reward_func/mean": 0.13325411081314087,
|
|
"rewards/grpo_reward_func/std": 0.12200622260570526,
|
|
"step": 372
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.0015358021191786975,
|
|
"learning_rate": 2.111111111111111e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9406015.0,
|
|
"reward": 0.1894848346710205,
|
|
"reward_std": 0.08906535059213638,
|
|
"rewards/grpo_reward_func/mean": 0.1894848346710205,
|
|
"rewards/grpo_reward_func/std": 0.11156714707612991,
|
|
"step": 373
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.0625,
|
|
"kl": 0.001031855761539191,
|
|
"learning_rate": 2.1018518518518518e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9431111.0,
|
|
"reward": 0.36022406816482544,
|
|
"reward_std": 0.0762445256114006,
|
|
"rewards/grpo_reward_func/mean": 0.36022406816482544,
|
|
"rewards/grpo_reward_func/std": 0.08881448209285736,
|
|
"step": 374
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.4375,
|
|
"kl": 0.0006837841647211462,
|
|
"learning_rate": 2.0925925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9455967.0,
|
|
"reward": 0.5004688501358032,
|
|
"reward_std": 0.1380087286233902,
|
|
"rewards/grpo_reward_func/mean": 0.5004688501358032,
|
|
"rewards/grpo_reward_func/std": 0.17277836799621582,
|
|
"step": 375
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5625,
|
|
"kl": 0.00043654504406731576,
|
|
"learning_rate": 2.0833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9480727.0,
|
|
"reward": 0.3416731357574463,
|
|
"reward_std": 0.08171491324901581,
|
|
"rewards/grpo_reward_func/mean": 0.3416731357574463,
|
|
"rewards/grpo_reward_func/std": 0.08727457374334335,
|
|
"step": 376
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.125,
|
|
"kl": 0.0005711590347345918,
|
|
"learning_rate": 2.0740740740740738e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9506119.0,
|
|
"reward": 0.2576856315135956,
|
|
"reward_std": 0.07897262275218964,
|
|
"rewards/grpo_reward_func/mean": 0.2576856315135956,
|
|
"rewards/grpo_reward_func/std": 0.1244540736079216,
|
|
"step": 377
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.875,
|
|
"kl": 0.0010556740162428468,
|
|
"learning_rate": 2.0648148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9531303.0,
|
|
"reward": 0.34753307700157166,
|
|
"reward_std": 0.18784627318382263,
|
|
"rewards/grpo_reward_func/mean": 0.34753307700157166,
|
|
"rewards/grpo_reward_func/std": 0.25699713826179504,
|
|
"step": 378
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.125,
|
|
"kl": 0.001215081021655351,
|
|
"learning_rate": 2.0555555555555553e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9556359.0,
|
|
"reward": 0.3578820824623108,
|
|
"reward_std": 0.16381356120109558,
|
|
"rewards/grpo_reward_func/mean": 0.3578820824623108,
|
|
"rewards/grpo_reward_func/std": 0.1917094886302948,
|
|
"step": 379
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.625,
|
|
"kl": 0.000709039144567214,
|
|
"learning_rate": 2.0462962962962964e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9581671.0,
|
|
"reward": 0.23046386241912842,
|
|
"reward_std": 0.15167269110679626,
|
|
"rewards/grpo_reward_func/mean": 0.23046386241912842,
|
|
"rewards/grpo_reward_func/std": 0.17622780799865723,
|
|
"step": 380
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.1875,
|
|
"kl": 0.0006106219661887735,
|
|
"learning_rate": 2.0370370370370369e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9606903.0,
|
|
"reward": 0.2709818184375763,
|
|
"reward_std": 0.12175773829221725,
|
|
"rewards/grpo_reward_func/mean": 0.2709818184375763,
|
|
"rewards/grpo_reward_func/std": 0.2393583059310913,
|
|
"step": 381
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.0006747972074663267,
|
|
"learning_rate": 2.0277777777777776e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9632351.0,
|
|
"reward": 0.1715637445449829,
|
|
"reward_std": 0.07509627938270569,
|
|
"rewards/grpo_reward_func/mean": 0.1715637445449829,
|
|
"rewards/grpo_reward_func/std": 0.12089093774557114,
|
|
"step": 382
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 5.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.375,
|
|
"kl": 0.0003686649724841118,
|
|
"learning_rate": 2.0185185185185187e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9657423.0,
|
|
"reward": 0.25206711888313293,
|
|
"reward_std": 0.11033091694116592,
|
|
"rewards/grpo_reward_func/mean": 0.25206711888313293,
|
|
"rewards/grpo_reward_func/std": 0.1744239181280136,
|
|
"step": 383
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.3125,
|
|
"kl": 0.0015397807583212852,
|
|
"learning_rate": 2.0092592592592591e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9682895.0,
|
|
"reward": 0.22812840342521667,
|
|
"reward_std": 0.16441597044467926,
|
|
"rewards/grpo_reward_func/mean": 0.22812840342521667,
|
|
"rewards/grpo_reward_func/std": 0.2453615814447403,
|
|
"step": 384
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.625,
|
|
"kl": 0.0013099961797706783,
|
|
"learning_rate": 2e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9707727.0,
|
|
"reward": 0.4902651309967041,
|
|
"reward_std": 0.12934622168540955,
|
|
"rewards/grpo_reward_func/mean": 0.4902651309967041,
|
|
"rewards/grpo_reward_func/std": 0.12886442244052887,
|
|
"step": 385
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.53125,
|
|
"kl": 0.0006103202176745981,
|
|
"learning_rate": 1.9907407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9732983.0,
|
|
"reward": 0.20654284954071045,
|
|
"reward_std": 0.07593633234500885,
|
|
"rewards/grpo_reward_func/mean": 0.20654284954071045,
|
|
"rewards/grpo_reward_func/std": 0.11230036616325378,
|
|
"step": 386
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.59375,
|
|
"kl": 0.0013176609645597637,
|
|
"learning_rate": 1.9814814814814814e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9758559.0,
|
|
"reward": 0.2913218140602112,
|
|
"reward_std": 0.07996957749128342,
|
|
"rewards/grpo_reward_func/mean": 0.2913218140602112,
|
|
"rewards/grpo_reward_func/std": 0.20861035585403442,
|
|
"step": 387
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.9375,
|
|
"kl": 0.00296200011507608,
|
|
"learning_rate": 1.9722222222222222e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9784007.0,
|
|
"reward": 0.3508840501308441,
|
|
"reward_std": 0.15029387176036835,
|
|
"rewards/grpo_reward_func/mean": 0.3508840501308441,
|
|
"rewards/grpo_reward_func/std": 0.1881677657365799,
|
|
"step": 388
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.375,
|
|
"kl": 0.0007045343518257141,
|
|
"learning_rate": 1.962962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9809183.0,
|
|
"reward": 0.38261544704437256,
|
|
"reward_std": 0.13104796409606934,
|
|
"rewards/grpo_reward_func/mean": 0.38261544704437256,
|
|
"rewards/grpo_reward_func/std": 0.16021603345870972,
|
|
"step": 389
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.5625,
|
|
"kl": 0.0007661972194910049,
|
|
"learning_rate": 1.9537037037037034e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9834375.0,
|
|
"reward": 0.2729978561401367,
|
|
"reward_std": 0.14411945641040802,
|
|
"rewards/grpo_reward_func/mean": 0.2729978561401367,
|
|
"rewards/grpo_reward_func/std": 0.18588939309120178,
|
|
"step": 390
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.375,
|
|
"kl": 0.000638260506093502,
|
|
"learning_rate": 1.9444444444444445e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9860271.0,
|
|
"reward": 0.14101102948188782,
|
|
"reward_std": 0.07042165100574493,
|
|
"rewards/grpo_reward_func/mean": 0.14101102948188782,
|
|
"rewards/grpo_reward_func/std": 0.08415806293487549,
|
|
"step": 391
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0625,
|
|
"kl": 0.0011364755337126553,
|
|
"learning_rate": 1.935185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9884943.0,
|
|
"reward": 0.4777190387248993,
|
|
"reward_std": 0.09997393190860748,
|
|
"rewards/grpo_reward_func/mean": 0.4777190387248993,
|
|
"rewards/grpo_reward_func/std": 0.10503428429365158,
|
|
"step": 392
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.875,
|
|
"kl": 0.0010265009186696261,
|
|
"learning_rate": 1.9259259259259257e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9910519.0,
|
|
"reward": 0.09008777141571045,
|
|
"reward_std": 0.1275341957807541,
|
|
"rewards/grpo_reward_func/mean": 0.09008777141571045,
|
|
"rewards/grpo_reward_func/std": 0.14905866980552673,
|
|
"step": 393
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5625,
|
|
"kl": 0.0010667061724234372,
|
|
"learning_rate": 1.9166666666666668e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9935623.0,
|
|
"reward": 0.3963939845561981,
|
|
"reward_std": 0.1269564926624298,
|
|
"rewards/grpo_reward_func/mean": 0.3963939845561981,
|
|
"rewards/grpo_reward_func/std": 0.12265895307064056,
|
|
"step": 394
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.8125,
|
|
"kl": 0.0010338767024222761,
|
|
"learning_rate": 1.9074074074074073e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9960807.0,
|
|
"reward": 0.339819073677063,
|
|
"reward_std": 0.131906658411026,
|
|
"rewards/grpo_reward_func/mean": 0.339819073677063,
|
|
"rewards/grpo_reward_func/std": 0.19305965304374695,
|
|
"step": 395
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.625,
|
|
"kl": 0.001020548545056954,
|
|
"learning_rate": 1.8981481481481483e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9986367.0,
|
|
"reward": 0.15465489029884338,
|
|
"reward_std": 0.1416236013174057,
|
|
"rewards/grpo_reward_func/mean": 0.15465489029884338,
|
|
"rewards/grpo_reward_func/std": 0.1588820368051529,
|
|
"step": 396
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.0015650332206860185,
|
|
"learning_rate": 1.8888888888888888e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10011895.0,
|
|
"reward": 0.16186270117759705,
|
|
"reward_std": 0.07497712969779968,
|
|
"rewards/grpo_reward_func/mean": 0.16186270117759705,
|
|
"rewards/grpo_reward_func/std": 0.16643059253692627,
|
|
"step": 397
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.0011266165529377759,
|
|
"learning_rate": 1.8796296296296295e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10036927.0,
|
|
"reward": 0.33959537744522095,
|
|
"reward_std": 0.14239588379859924,
|
|
"rewards/grpo_reward_func/mean": 0.33959537744522095,
|
|
"rewards/grpo_reward_func/std": 0.18363837897777557,
|
|
"step": 398
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.00058710016310215,
|
|
"learning_rate": 1.8703703703703703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10062167.0,
|
|
"reward": 0.18064472079277039,
|
|
"reward_std": 0.10977937281131744,
|
|
"rewards/grpo_reward_func/mean": 0.18064472079277039,
|
|
"rewards/grpo_reward_func/std": 0.12083122134208679,
|
|
"step": 399
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.0005866416468052194,
|
|
"learning_rate": 1.861111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10088111.0,
|
|
"reward": 0.1212317943572998,
|
|
"reward_std": 0.11557292938232422,
|
|
"rewards/grpo_reward_func/mean": 0.1212317943572998,
|
|
"rewards/grpo_reward_func/std": 0.13769572973251343,
|
|
"step": 400
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.1875,
|
|
"kl": 0.0011352576548233628,
|
|
"learning_rate": 1.8518518518518516e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10113127.0,
|
|
"reward": 0.3895754814147949,
|
|
"reward_std": 0.13341936469078064,
|
|
"rewards/grpo_reward_func/mean": 0.3895754814147949,
|
|
"rewards/grpo_reward_func/std": 0.1501649022102356,
|
|
"step": 401
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.28125,
|
|
"kl": 0.0009206359682139009,
|
|
"learning_rate": 1.8425925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10138511.0,
|
|
"reward": 0.29999056458473206,
|
|
"reward_std": 0.08621784299612045,
|
|
"rewards/grpo_reward_func/mean": 0.29999056458473206,
|
|
"rewards/grpo_reward_func/std": 0.24252568185329437,
|
|
"step": 402
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0625,
|
|
"kl": 0.0011214362166356295,
|
|
"learning_rate": 1.833333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10164063.0,
|
|
"reward": 0.25143373012542725,
|
|
"reward_std": 0.09383880347013474,
|
|
"rewards/grpo_reward_func/mean": 0.25143373012542725,
|
|
"rewards/grpo_reward_func/std": 0.13475444912910461,
|
|
"step": 403
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.625,
|
|
"kl": 0.0012225185928400606,
|
|
"learning_rate": 1.824074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10188807.0,
|
|
"reward": 0.3598247468471527,
|
|
"reward_std": 0.1472463756799698,
|
|
"rewards/grpo_reward_func/mean": 0.3598247468471527,
|
|
"rewards/grpo_reward_func/std": 0.15841133892536163,
|
|
"step": 404
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.4375,
|
|
"kl": 0.0006255771440919489,
|
|
"learning_rate": 1.8148148148148149e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10214127.0,
|
|
"reward": 0.1664871722459793,
|
|
"reward_std": 0.12487848103046417,
|
|
"rewards/grpo_reward_func/mean": 0.1664871722459793,
|
|
"rewards/grpo_reward_func/std": 0.22237654030323029,
|
|
"step": 405
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0625,
|
|
"kl": 0.0012482001329772174,
|
|
"learning_rate": 1.8055555555555554e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10238927.0,
|
|
"reward": 0.48037129640579224,
|
|
"reward_std": 0.12004883587360382,
|
|
"rewards/grpo_reward_func/mean": 0.48037129640579224,
|
|
"rewards/grpo_reward_func/std": 0.17319442331790924,
|
|
"step": 406
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.5,
|
|
"kl": 0.0008822223462630063,
|
|
"learning_rate": 1.7962962962962964e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10264383.0,
|
|
"reward": 0.24905702471733093,
|
|
"reward_std": 0.09374159574508667,
|
|
"rewards/grpo_reward_func/mean": 0.24905702471733093,
|
|
"rewards/grpo_reward_func/std": 0.20010940730571747,
|
|
"step": 407
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.0,
|
|
"kl": 0.001847555220592767,
|
|
"learning_rate": 1.787037037037037e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10289287.0,
|
|
"reward": 0.3152793049812317,
|
|
"reward_std": 0.08679287880659103,
|
|
"rewards/grpo_reward_func/mean": 0.3152793049812317,
|
|
"rewards/grpo_reward_func/std": 0.09012952446937561,
|
|
"step": 408
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.875,
|
|
"kl": 0.0023550866171717644,
|
|
"learning_rate": 1.7777777777777776e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10314655.0,
|
|
"reward": 0.2986000180244446,
|
|
"reward_std": 0.16402404010295868,
|
|
"rewards/grpo_reward_func/mean": 0.2986000180244446,
|
|
"rewards/grpo_reward_func/std": 0.18015649914741516,
|
|
"step": 409
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5,
|
|
"kl": 0.0007087271515047178,
|
|
"learning_rate": 1.7685185185185184e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10339839.0,
|
|
"reward": 0.22273430228233337,
|
|
"reward_std": 0.11841462552547455,
|
|
"rewards/grpo_reward_func/mean": 0.22273430228233337,
|
|
"rewards/grpo_reward_func/std": 0.13113521039485931,
|
|
"step": 410
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.25,
|
|
"kl": 0.0006067858485039324,
|
|
"learning_rate": 1.7592592592592592e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10364735.0,
|
|
"reward": 0.28128859400749207,
|
|
"reward_std": 0.1168629378080368,
|
|
"rewards/grpo_reward_func/mean": 0.28128859400749207,
|
|
"rewards/grpo_reward_func/std": 0.23241311311721802,
|
|
"step": 411
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.0004597442748490721,
|
|
"learning_rate": 1.75e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10389631.0,
|
|
"reward": 0.3766486644744873,
|
|
"reward_std": 0.08330925554037094,
|
|
"rewards/grpo_reward_func/mean": 0.3766486644744873,
|
|
"rewards/grpo_reward_func/std": 0.09394894540309906,
|
|
"step": 412
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.59375,
|
|
"kl": 0.0014080870896577835,
|
|
"learning_rate": 1.7407407407407407e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10415031.0,
|
|
"reward": 0.14589783549308777,
|
|
"reward_std": 0.1070764809846878,
|
|
"rewards/grpo_reward_func/mean": 0.14589783549308777,
|
|
"rewards/grpo_reward_func/std": 0.18981052935123444,
|
|
"step": 413
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.625,
|
|
"kl": 0.001625294506084174,
|
|
"learning_rate": 1.7314814814814812e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10440183.0,
|
|
"reward": 0.30828016996383667,
|
|
"reward_std": 0.13814100623130798,
|
|
"rewards/grpo_reward_func/mean": 0.30828016996383667,
|
|
"rewards/grpo_reward_func/std": 0.1791732907295227,
|
|
"step": 414
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.875,
|
|
"kl": 0.0011470156605355442,
|
|
"learning_rate": 1.7222222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10465111.0,
|
|
"reward": 0.555537760257721,
|
|
"reward_std": 0.13992036879062653,
|
|
"rewards/grpo_reward_func/mean": 0.555537760257721,
|
|
"rewards/grpo_reward_func/std": 0.1409028172492981,
|
|
"step": 415
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0,
|
|
"kl": 0.0006981039041420445,
|
|
"learning_rate": 1.712962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10490231.0,
|
|
"reward": 0.2912987172603607,
|
|
"reward_std": 0.06564676761627197,
|
|
"rewards/grpo_reward_func/mean": 0.2912987172603607,
|
|
"rewards/grpo_reward_func/std": 0.09684650599956512,
|
|
"step": 416
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.5,
|
|
"kl": 0.001724751084111631,
|
|
"learning_rate": 1.7037037037037035e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10515439.0,
|
|
"reward": 0.2970144748687744,
|
|
"reward_std": 0.09115941822528839,
|
|
"rewards/grpo_reward_func/mean": 0.2970144748687744,
|
|
"rewards/grpo_reward_func/std": 0.1823878437280655,
|
|
"step": 417
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.0006452612578868866,
|
|
"learning_rate": 1.6944444444444445e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10540311.0,
|
|
"reward": 0.2946871221065521,
|
|
"reward_std": 0.07320266216993332,
|
|
"rewards/grpo_reward_func/mean": 0.2946871221065521,
|
|
"rewards/grpo_reward_func/std": 0.10113289952278137,
|
|
"step": 418
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.3125,
|
|
"kl": 0.0011799820058513433,
|
|
"learning_rate": 1.685185185185185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10565607.0,
|
|
"reward": 0.36064761877059937,
|
|
"reward_std": 0.12253247946500778,
|
|
"rewards/grpo_reward_func/mean": 0.36064761877059937,
|
|
"rewards/grpo_reward_func/std": 0.1580498218536377,
|
|
"step": 419
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.0006311101315077394,
|
|
"learning_rate": 1.675925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10590943.0,
|
|
"reward": 0.255226194858551,
|
|
"reward_std": 0.1812346875667572,
|
|
"rewards/grpo_reward_func/mean": 0.255226194858551,
|
|
"rewards/grpo_reward_func/std": 0.21465614438056946,
|
|
"step": 420
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.3125,
|
|
"kl": 0.0003934912383556366,
|
|
"learning_rate": 1.6666666666666665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10616319.0,
|
|
"reward": 0.23169803619384766,
|
|
"reward_std": 0.11887459456920624,
|
|
"rewards/grpo_reward_func/mean": 0.23169803619384766,
|
|
"rewards/grpo_reward_func/std": 0.15518471598625183,
|
|
"step": 421
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.75,
|
|
"kl": 0.0007099388167262077,
|
|
"learning_rate": 1.6574074074074073e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10642207.0,
|
|
"reward": 0.061446841806173325,
|
|
"reward_std": 0.14731059968471527,
|
|
"rewards/grpo_reward_func/mean": 0.061446841806173325,
|
|
"rewards/grpo_reward_func/std": 0.14346599578857422,
|
|
"step": 422
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5,
|
|
"kl": 0.0018291054293513298,
|
|
"learning_rate": 1.648148148148148e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10667199.0,
|
|
"reward": 0.2039181888103485,
|
|
"reward_std": 0.17049936950206757,
|
|
"rewards/grpo_reward_func/mean": 0.2039181888103485,
|
|
"rewards/grpo_reward_func/std": 0.20998157560825348,
|
|
"step": 423
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.375,
|
|
"kl": 0.0011474412167444825,
|
|
"learning_rate": 1.6388888888888888e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10692487.0,
|
|
"reward": 0.26651886105537415,
|
|
"reward_std": 0.10566580295562744,
|
|
"rewards/grpo_reward_func/mean": 0.26651886105537415,
|
|
"rewards/grpo_reward_func/std": 0.23722681403160095,
|
|
"step": 424
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.5,
|
|
"kl": 0.0013400282186921686,
|
|
"learning_rate": 1.6296296296296298e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10717431.0,
|
|
"reward": 0.4234394431114197,
|
|
"reward_std": 0.11313052475452423,
|
|
"rewards/grpo_reward_func/mean": 0.4234394431114197,
|
|
"rewards/grpo_reward_func/std": 0.15678033232688904,
|
|
"step": 425
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.4375,
|
|
"kl": 0.0011549523624125868,
|
|
"learning_rate": 1.6203703703703703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10743015.0,
|
|
"reward": 0.20128034055233002,
|
|
"reward_std": 0.09024003148078918,
|
|
"rewards/grpo_reward_func/mean": 0.20128034055233002,
|
|
"rewards/grpo_reward_func/std": 0.19719721376895905,
|
|
"step": 426
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.90625,
|
|
"kl": 0.0007055181486066431,
|
|
"learning_rate": 1.611111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10768159.0,
|
|
"reward": 0.26648253202438354,
|
|
"reward_std": 0.04378199204802513,
|
|
"rewards/grpo_reward_func/mean": 0.26648253202438354,
|
|
"rewards/grpo_reward_func/std": 0.05463617295026779,
|
|
"step": 427
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.0,
|
|
"kl": 0.0019033817807212472,
|
|
"learning_rate": 1.6018518518518518e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10793215.0,
|
|
"reward": 0.3865242600440979,
|
|
"reward_std": 0.12939924001693726,
|
|
"rewards/grpo_reward_func/mean": 0.3865242600440979,
|
|
"rewards/grpo_reward_func/std": 0.13254022598266602,
|
|
"step": 428
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.0012472507660277188,
|
|
"learning_rate": 1.5925925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10818415.0,
|
|
"reward": 0.21385148167610168,
|
|
"reward_std": 0.1188204437494278,
|
|
"rewards/grpo_reward_func/mean": 0.21385148167610168,
|
|
"rewards/grpo_reward_func/std": 0.19592618942260742,
|
|
"step": 429
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.9375,
|
|
"kl": 0.0010245550947729498,
|
|
"learning_rate": 1.583333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10843663.0,
|
|
"reward": 0.2595471739768982,
|
|
"reward_std": 0.127162903547287,
|
|
"rewards/grpo_reward_func/mean": 0.2595471739768982,
|
|
"rewards/grpo_reward_func/std": 0.1796990931034088,
|
|
"step": 430
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0625,
|
|
"kl": 0.001687789976131171,
|
|
"learning_rate": 1.574074074074074e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10869183.0,
|
|
"reward": 0.1994100958108902,
|
|
"reward_std": 0.09664750844240189,
|
|
"rewards/grpo_reward_func/mean": 0.1994100958108902,
|
|
"rewards/grpo_reward_func/std": 0.19897720217704773,
|
|
"step": 431
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.4375,
|
|
"kl": 0.0014949360047467053,
|
|
"learning_rate": 1.5648148148148146e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10894479.0,
|
|
"reward": 0.24675852060317993,
|
|
"reward_std": 0.10688143223524094,
|
|
"rewards/grpo_reward_func/mean": 0.24675852060317993,
|
|
"rewards/grpo_reward_func/std": 0.12910714745521545,
|
|
"step": 432
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.125,
|
|
"kl": 0.0012780396500602365,
|
|
"learning_rate": 1.5555555555555556e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10919831.0,
|
|
"reward": 0.2442871779203415,
|
|
"reward_std": 0.10166030377149582,
|
|
"rewards/grpo_reward_func/mean": 0.2442871779203415,
|
|
"rewards/grpo_reward_func/std": 0.18765191733837128,
|
|
"step": 433
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.0011734996805898845,
|
|
"learning_rate": 1.546296296296296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10945471.0,
|
|
"reward": 0.21862854063510895,
|
|
"reward_std": 0.09884025901556015,
|
|
"rewards/grpo_reward_func/mean": 0.21862854063510895,
|
|
"rewards/grpo_reward_func/std": 0.2449023574590683,
|
|
"step": 434
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.0,
|
|
"kl": 0.0006825063901487738,
|
|
"learning_rate": 1.537037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10970375.0,
|
|
"reward": 0.3288189470767975,
|
|
"reward_std": 0.1153157502412796,
|
|
"rewards/grpo_reward_func/mean": 0.3288189470767975,
|
|
"rewards/grpo_reward_func/std": 0.12331512570381165,
|
|
"step": 435
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.9375,
|
|
"kl": 0.0016119840438477695,
|
|
"learning_rate": 1.527777777777778e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 10995247.0,
|
|
"reward": 0.43004822731018066,
|
|
"reward_std": 0.14090153574943542,
|
|
"rewards/grpo_reward_func/mean": 0.43004822731018066,
|
|
"rewards/grpo_reward_func/std": 0.15082739293575287,
|
|
"step": 436
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.75,
|
|
"kl": 0.001201577513711527,
|
|
"learning_rate": 1.5185185185185184e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11020023.0,
|
|
"reward": 0.4190066158771515,
|
|
"reward_std": 0.11930587887763977,
|
|
"rewards/grpo_reward_func/mean": 0.4190066158771515,
|
|
"rewards/grpo_reward_func/std": 0.13784445822238922,
|
|
"step": 437
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5625,
|
|
"kl": 0.00105038468609564,
|
|
"learning_rate": 1.5092592592592592e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11045703.0,
|
|
"reward": 0.15409120917320251,
|
|
"reward_std": 0.0602850615978241,
|
|
"rewards/grpo_reward_func/mean": 0.15409120917320251,
|
|
"rewards/grpo_reward_func/std": 0.09643904119729996,
|
|
"step": 438
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.5625,
|
|
"kl": 0.0010647746094036847,
|
|
"learning_rate": 1.5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11070775.0,
|
|
"reward": 0.23255158960819244,
|
|
"reward_std": 0.121828094124794,
|
|
"rewards/grpo_reward_func/mean": 0.23255158960819244,
|
|
"rewards/grpo_reward_func/std": 0.18913201987743378,
|
|
"step": 439
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.625,
|
|
"kl": 0.0009447056509088725,
|
|
"learning_rate": 1.4907407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11096191.0,
|
|
"reward": 0.2456386387348175,
|
|
"reward_std": 0.08084774017333984,
|
|
"rewards/grpo_reward_func/mean": 0.2456386387348175,
|
|
"rewards/grpo_reward_func/std": 0.189274862408638,
|
|
"step": 440
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.875,
|
|
"kl": 0.0006106716318754479,
|
|
"learning_rate": 1.4814814814814815e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11121111.0,
|
|
"reward": 0.26635077595710754,
|
|
"reward_std": 0.11237984895706177,
|
|
"rewards/grpo_reward_func/mean": 0.26635077595710754,
|
|
"rewards/grpo_reward_func/std": 0.14029628038406372,
|
|
"step": 441
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.9375,
|
|
"kl": 0.000595945239183493,
|
|
"learning_rate": 1.4722222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11146031.0,
|
|
"reward": 0.3882697820663452,
|
|
"reward_std": 0.11520197987556458,
|
|
"rewards/grpo_reward_func/mean": 0.3882697820663452,
|
|
"rewards/grpo_reward_func/std": 0.15360315144062042,
|
|
"step": 442
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.9375,
|
|
"kl": 0.0020495177595876157,
|
|
"learning_rate": 1.4629629629629627e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11170975.0,
|
|
"reward": 0.3387864828109741,
|
|
"reward_std": 0.07305868715047836,
|
|
"rewards/grpo_reward_func/mean": 0.3387864828109741,
|
|
"rewards/grpo_reward_func/std": 0.1130770593881607,
|
|
"step": 443
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.9375,
|
|
"kl": 0.0012185790692456067,
|
|
"learning_rate": 1.4537037037037037e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11195919.0,
|
|
"reward": 0.3070831000804901,
|
|
"reward_std": 0.08975166827440262,
|
|
"rewards/grpo_reward_func/mean": 0.3070831000804901,
|
|
"rewards/grpo_reward_func/std": 0.0882553681731224,
|
|
"step": 444
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.0,
|
|
"kl": 0.002497857822163496,
|
|
"learning_rate": 1.4444444444444442e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11221399.0,
|
|
"reward": 0.3126865029335022,
|
|
"reward_std": 0.13266904652118683,
|
|
"rewards/grpo_reward_func/mean": 0.3126865029335022,
|
|
"rewards/grpo_reward_func/std": 0.14260481297969818,
|
|
"step": 445
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.5,
|
|
"kl": 0.00145068543497473,
|
|
"learning_rate": 1.435185185185185e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11246231.0,
|
|
"reward": 0.36069953441619873,
|
|
"reward_std": 0.10953323543071747,
|
|
"rewards/grpo_reward_func/mean": 0.36069953441619873,
|
|
"rewards/grpo_reward_func/std": 0.11150137335062027,
|
|
"step": 446
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 6.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.5625,
|
|
"kl": 0.0013835610006935894,
|
|
"learning_rate": 1.425925925925926e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11271471.0,
|
|
"reward": 0.3240332305431366,
|
|
"reward_std": 0.09530281275510788,
|
|
"rewards/grpo_reward_func/mean": 0.3240332305431366,
|
|
"rewards/grpo_reward_func/std": 0.17097727954387665,
|
|
"step": 447
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0,
|
|
"kl": 0.0013482021167874336,
|
|
"learning_rate": 1.4166666666666665e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11296711.0,
|
|
"reward": 0.309320330619812,
|
|
"reward_std": 0.0705028623342514,
|
|
"rewards/grpo_reward_func/mean": 0.309320330619812,
|
|
"rewards/grpo_reward_func/std": 0.2660787105560303,
|
|
"step": 448
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.625,
|
|
"kl": 0.0004683341830968857,
|
|
"learning_rate": 1.4074074074074075e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11322287.0,
|
|
"reward": 0.19587868452072144,
|
|
"reward_std": 0.08648732304573059,
|
|
"rewards/grpo_reward_func/mean": 0.19587868452072144,
|
|
"rewards/grpo_reward_func/std": 0.09603223204612732,
|
|
"step": 449
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.3125,
|
|
"kl": 0.0008416750060860068,
|
|
"learning_rate": 1.398148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11347087.0,
|
|
"reward": 0.46222931146621704,
|
|
"reward_std": 0.10860372334718704,
|
|
"rewards/grpo_reward_func/mean": 0.46222931146621704,
|
|
"rewards/grpo_reward_func/std": 0.15214873850345612,
|
|
"step": 450
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.25,
|
|
"kl": 0.0014006058045197278,
|
|
"learning_rate": 1.3888888888888888e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11371863.0,
|
|
"reward": 0.5111293792724609,
|
|
"reward_std": 0.08110877871513367,
|
|
"rewards/grpo_reward_func/mean": 0.5111293792724609,
|
|
"rewards/grpo_reward_func/std": 0.09304346144199371,
|
|
"step": 451
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.25,
|
|
"kl": 0.0011988499463768676,
|
|
"learning_rate": 1.3796296296296296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11396831.0,
|
|
"reward": 0.4040873050689697,
|
|
"reward_std": 0.09579525142908096,
|
|
"rewards/grpo_reward_func/mean": 0.4040873050689697,
|
|
"rewards/grpo_reward_func/std": 0.09277461469173431,
|
|
"step": 452
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.6875,
|
|
"kl": 0.0012973888660781085,
|
|
"learning_rate": 1.3703703703703703e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11421887.0,
|
|
"reward": 0.33569732308387756,
|
|
"reward_std": 0.06503790616989136,
|
|
"rewards/grpo_reward_func/mean": 0.33569732308387756,
|
|
"rewards/grpo_reward_func/std": 0.07260998338460922,
|
|
"step": 453
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.1875,
|
|
"kl": 0.0012591605191119015,
|
|
"learning_rate": 1.3611111111111108e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11446743.0,
|
|
"reward": 0.46880820393562317,
|
|
"reward_std": 0.12013055384159088,
|
|
"rewards/grpo_reward_func/mean": 0.46880820393562317,
|
|
"rewards/grpo_reward_func/std": 0.1322290599346161,
|
|
"step": 454
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.125,
|
|
"kl": 0.0012072750396328047,
|
|
"learning_rate": 1.3518518518518518e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11472823.0,
|
|
"reward": 0.0363832488656044,
|
|
"reward_std": 0.07413066923618317,
|
|
"rewards/grpo_reward_func/mean": 0.0363832488656044,
|
|
"rewards/grpo_reward_func/std": 0.10118604451417923,
|
|
"step": 455
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.125,
|
|
"kl": 0.0018538168515078723,
|
|
"learning_rate": 1.3425925925925926e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11498455.0,
|
|
"reward": 0.1913381814956665,
|
|
"reward_std": 0.0929805338382721,
|
|
"rewards/grpo_reward_func/mean": 0.1913381814956665,
|
|
"rewards/grpo_reward_func/std": 0.16615688800811768,
|
|
"step": 456
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.9375,
|
|
"kl": 0.0006855083629488945,
|
|
"learning_rate": 1.3333333333333334e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11523255.0,
|
|
"reward": 0.27506938576698303,
|
|
"reward_std": 0.10065136849880219,
|
|
"rewards/grpo_reward_func/mean": 0.27506938576698303,
|
|
"rewards/grpo_reward_func/std": 0.19226348400115967,
|
|
"step": 457
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.4375,
|
|
"kl": 0.00046966194349806756,
|
|
"learning_rate": 1.324074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11548327.0,
|
|
"reward": 0.25183096528053284,
|
|
"reward_std": 0.10666584968566895,
|
|
"rewards/grpo_reward_func/mean": 0.25183096528053284,
|
|
"rewards/grpo_reward_func/std": 0.1858406513929367,
|
|
"step": 458
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.4375,
|
|
"kl": 0.0009555828873999417,
|
|
"learning_rate": 1.3148148148148146e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11573839.0,
|
|
"reward": 0.2436429113149643,
|
|
"reward_std": 0.1274951547384262,
|
|
"rewards/grpo_reward_func/mean": 0.2436429113149643,
|
|
"rewards/grpo_reward_func/std": 0.19650402665138245,
|
|
"step": 459
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.4375,
|
|
"kl": 0.001801790960598737,
|
|
"learning_rate": 1.3055555555555556e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11598935.0,
|
|
"reward": 0.3082733154296875,
|
|
"reward_std": 0.2006826102733612,
|
|
"rewards/grpo_reward_func/mean": 0.3082733154296875,
|
|
"rewards/grpo_reward_func/std": 0.24268022179603577,
|
|
"step": 460
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.5,
|
|
"kl": 0.0008424867992289364,
|
|
"learning_rate": 1.2962962962962961e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11623903.0,
|
|
"reward": 0.36446842551231384,
|
|
"reward_std": 0.10075566172599792,
|
|
"rewards/grpo_reward_func/mean": 0.36446842551231384,
|
|
"rewards/grpo_reward_func/std": 0.14413763582706451,
|
|
"step": 461
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.3125,
|
|
"kl": 0.00168600061442703,
|
|
"learning_rate": 1.287037037037037e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11648591.0,
|
|
"reward": 0.34162139892578125,
|
|
"reward_std": 0.07324203103780746,
|
|
"rewards/grpo_reward_func/mean": 0.34162139892578125,
|
|
"rewards/grpo_reward_func/std": 0.09398888051509857,
|
|
"step": 462
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.0625,
|
|
"kl": 0.0011306122469250113,
|
|
"learning_rate": 1.2777777777777777e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11674071.0,
|
|
"reward": 0.09499461203813553,
|
|
"reward_std": 0.08520884811878204,
|
|
"rewards/grpo_reward_func/mean": 0.09499461203813553,
|
|
"rewards/grpo_reward_func/std": 0.10134407877922058,
|
|
"step": 463
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.8125,
|
|
"kl": 0.0006733744667144492,
|
|
"learning_rate": 1.2685185185185184e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11699335.0,
|
|
"reward": 0.2652754783630371,
|
|
"reward_std": 0.09839694201946259,
|
|
"rewards/grpo_reward_func/mean": 0.2652754783630371,
|
|
"rewards/grpo_reward_func/std": 0.10382307320833206,
|
|
"step": 464
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.0625,
|
|
"kl": 0.0005735903978347778,
|
|
"learning_rate": 1.2592592592592592e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11725127.0,
|
|
"reward": 0.12507686018943787,
|
|
"reward_std": 0.14018288254737854,
|
|
"rewards/grpo_reward_func/mean": 0.12507686018943787,
|
|
"rewards/grpo_reward_func/std": 0.15515510737895966,
|
|
"step": 465
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.3125,
|
|
"kl": 0.002856824896298349,
|
|
"learning_rate": 1.25e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11750423.0,
|
|
"reward": 0.2550414800643921,
|
|
"reward_std": 0.10255648195743561,
|
|
"rewards/grpo_reward_func/mean": 0.2550414800643921,
|
|
"rewards/grpo_reward_func/std": 0.18651345372200012,
|
|
"step": 466
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.000869341180077754,
|
|
"learning_rate": 1.2407407407407407e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11776615.0,
|
|
"reward": 0.18968135118484497,
|
|
"reward_std": 0.12408946454524994,
|
|
"rewards/grpo_reward_func/mean": 0.18968135118484497,
|
|
"rewards/grpo_reward_func/std": 0.2001882642507553,
|
|
"step": 467
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.6875,
|
|
"kl": 0.0007764963957015425,
|
|
"learning_rate": 1.2314814814814815e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11802007.0,
|
|
"reward": 0.28070777654647827,
|
|
"reward_std": 0.07643483579158783,
|
|
"rewards/grpo_reward_func/mean": 0.28070777654647827,
|
|
"rewards/grpo_reward_func/std": 0.08400604128837585,
|
|
"step": 468
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.000670313835144043,
|
|
"learning_rate": 1.2222222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11827703.0,
|
|
"reward": 0.09706881642341614,
|
|
"reward_std": 0.062192559242248535,
|
|
"rewards/grpo_reward_func/mean": 0.09706881642341614,
|
|
"rewards/grpo_reward_func/std": 0.06070980429649353,
|
|
"step": 469
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.625,
|
|
"kl": 0.0012332831101957709,
|
|
"learning_rate": 1.212962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11852679.0,
|
|
"reward": 0.29699277877807617,
|
|
"reward_std": 0.10623517632484436,
|
|
"rewards/grpo_reward_func/mean": 0.29699277877807617,
|
|
"rewards/grpo_reward_func/std": 0.14658339321613312,
|
|
"step": 470
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.71875,
|
|
"kl": 0.0010181590914726257,
|
|
"learning_rate": 1.2037037037037035e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11878375.0,
|
|
"reward": 0.21717457473278046,
|
|
"reward_std": 0.0667777881026268,
|
|
"rewards/grpo_reward_func/mean": 0.21717457473278046,
|
|
"rewards/grpo_reward_func/std": 0.1625964194536209,
|
|
"step": 471
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.8125,
|
|
"kl": 0.0016762353479862213,
|
|
"learning_rate": 1.1944444444444445e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11903631.0,
|
|
"reward": 0.22018520534038544,
|
|
"reward_std": 0.10192622244358063,
|
|
"rewards/grpo_reward_func/mean": 0.22018520534038544,
|
|
"rewards/grpo_reward_func/std": 0.14623050391674042,
|
|
"step": 472
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5,
|
|
"kl": 0.0004212940257275477,
|
|
"learning_rate": 1.1851851851851851e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11928839.0,
|
|
"reward": 0.21051539480686188,
|
|
"reward_std": 0.13067127764225006,
|
|
"rewards/grpo_reward_func/mean": 0.21051539480686188,
|
|
"rewards/grpo_reward_func/std": 0.21328914165496826,
|
|
"step": 473
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.625,
|
|
"kl": 0.0012048408098053187,
|
|
"learning_rate": 1.1759259259259259e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11954751.0,
|
|
"reward": 0.05355652794241905,
|
|
"reward_std": 0.10848183929920197,
|
|
"rewards/grpo_reward_func/mean": 0.05355652794241905,
|
|
"rewards/grpo_reward_func/std": 0.10706359893083572,
|
|
"step": 474
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.0014668001676909626,
|
|
"learning_rate": 1.1666666666666667e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 11979711.0,
|
|
"reward": 0.38055509328842163,
|
|
"reward_std": 0.097016841173172,
|
|
"rewards/grpo_reward_func/mean": 0.38055509328842163,
|
|
"rewards/grpo_reward_func/std": 0.17705973982810974,
|
|
"step": 475
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.1875,
|
|
"kl": 0.001055899978382513,
|
|
"learning_rate": 1.1574074074074074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12005343.0,
|
|
"reward": 0.2753819525241852,
|
|
"reward_std": 0.10407428443431854,
|
|
"rewards/grpo_reward_func/mean": 0.2753819525241852,
|
|
"rewards/grpo_reward_func/std": 0.24588319659233093,
|
|
"step": 476
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.75,
|
|
"kl": 0.00115215900586918,
|
|
"learning_rate": 1.148148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12030439.0,
|
|
"reward": 0.31799519062042236,
|
|
"reward_std": 0.12696103751659393,
|
|
"rewards/grpo_reward_func/mean": 0.31799519062042236,
|
|
"rewards/grpo_reward_func/std": 0.14141467213630676,
|
|
"step": 477
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0,
|
|
"kl": 0.0012049302167724818,
|
|
"learning_rate": 1.1388888888888888e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12055199.0,
|
|
"reward": 0.31926023960113525,
|
|
"reward_std": 0.12855856120586395,
|
|
"rewards/grpo_reward_func/mean": 0.31926023960113525,
|
|
"rewards/grpo_reward_func/std": 0.13934962451457977,
|
|
"step": 478
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.8125,
|
|
"kl": 0.0006446990446420386,
|
|
"learning_rate": 1.1296296296296296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12080303.0,
|
|
"reward": 0.2243320643901825,
|
|
"reward_std": 0.10535985231399536,
|
|
"rewards/grpo_reward_func/mean": 0.2243320643901825,
|
|
"rewards/grpo_reward_func/std": 0.13603921234607697,
|
|
"step": 479
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.875,
|
|
"kl": 0.0008130024070851505,
|
|
"learning_rate": 1.1203703703703703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12105327.0,
|
|
"reward": 0.3918301463127136,
|
|
"reward_std": 0.12956207990646362,
|
|
"rewards/grpo_reward_func/mean": 0.3918301463127136,
|
|
"rewards/grpo_reward_func/std": 0.14622493088245392,
|
|
"step": 480
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0625,
|
|
"kl": 0.001358548819553107,
|
|
"learning_rate": 1.111111111111111e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12130511.0,
|
|
"reward": 0.2851361334323883,
|
|
"reward_std": 0.1555725485086441,
|
|
"rewards/grpo_reward_func/mean": 0.2851361334323883,
|
|
"rewards/grpo_reward_func/std": 0.1634860336780548,
|
|
"step": 481
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.4375,
|
|
"kl": 0.0005101002752780914,
|
|
"learning_rate": 1.1018518518518519e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12156063.0,
|
|
"reward": 0.12298320978879929,
|
|
"reward_std": 0.05867529287934303,
|
|
"rewards/grpo_reward_func/mean": 0.12298320978879929,
|
|
"rewards/grpo_reward_func/std": 0.06098959594964981,
|
|
"step": 482
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5,
|
|
"kl": 0.0007716265245107934,
|
|
"learning_rate": 1.0925925925925926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12181463.0,
|
|
"reward": 0.149795800447464,
|
|
"reward_std": 0.10114938765764236,
|
|
"rewards/grpo_reward_func/mean": 0.149795800447464,
|
|
"rewards/grpo_reward_func/std": 0.11674728989601135,
|
|
"step": 483
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.6875,
|
|
"kl": 0.0016588344587944448,
|
|
"learning_rate": 1.0833333333333334e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12206231.0,
|
|
"reward": 0.35547971725463867,
|
|
"reward_std": 0.12198421359062195,
|
|
"rewards/grpo_reward_func/mean": 0.35547971725463867,
|
|
"rewards/grpo_reward_func/std": 0.14273548126220703,
|
|
"step": 484
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.9375,
|
|
"kl": 0.0024268822162412107,
|
|
"learning_rate": 1.074074074074074e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12231703.0,
|
|
"reward": 0.28086715936660767,
|
|
"reward_std": 0.13494156301021576,
|
|
"rewards/grpo_reward_func/mean": 0.28086715936660767,
|
|
"rewards/grpo_reward_func/std": 0.22161196172237396,
|
|
"step": 485
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.5625,
|
|
"kl": 0.0006409324705600739,
|
|
"learning_rate": 1.0648148148148148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12256543.0,
|
|
"reward": 0.317976176738739,
|
|
"reward_std": 0.06494971364736557,
|
|
"rewards/grpo_reward_func/mean": 0.317976176738739,
|
|
"rewards/grpo_reward_func/std": 0.0717727318406105,
|
|
"step": 486
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.0014852539461571723,
|
|
"learning_rate": 1.0555555555555555e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12281863.0,
|
|
"reward": 0.18861307203769684,
|
|
"reward_std": 0.17945444583892822,
|
|
"rewards/grpo_reward_func/mean": 0.18861307203769684,
|
|
"rewards/grpo_reward_func/std": 0.1798945814371109,
|
|
"step": 487
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.8125,
|
|
"kl": 0.0006892156670801342,
|
|
"learning_rate": 1.0462962962962963e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12306919.0,
|
|
"reward": 0.30536431074142456,
|
|
"reward_std": 0.15281975269317627,
|
|
"rewards/grpo_reward_func/mean": 0.30536431074142456,
|
|
"rewards/grpo_reward_func/std": 0.17713572084903717,
|
|
"step": 488
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.6875,
|
|
"kl": 0.000986199505859986,
|
|
"learning_rate": 1.0370370370370369e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12332351.0,
|
|
"reward": 0.14482443034648895,
|
|
"reward_std": 0.17538149654865265,
|
|
"rewards/grpo_reward_func/mean": 0.14482443034648895,
|
|
"rewards/grpo_reward_func/std": 0.20862267911434174,
|
|
"step": 489
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.75,
|
|
"kl": 0.0005291861889418215,
|
|
"learning_rate": 1.0277777777777777e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12357783.0,
|
|
"reward": 0.24565353989601135,
|
|
"reward_std": 0.07573194801807404,
|
|
"rewards/grpo_reward_func/mean": 0.24565353989601135,
|
|
"rewards/grpo_reward_func/std": 0.17366138100624084,
|
|
"step": 490
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.625,
|
|
"kl": 0.0012580165202962235,
|
|
"learning_rate": 1.0185185185185184e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12382783.0,
|
|
"reward": 0.28701528906822205,
|
|
"reward_std": 0.13948951661586761,
|
|
"rewards/grpo_reward_func/mean": 0.28701528906822205,
|
|
"rewards/grpo_reward_func/std": 0.1597539633512497,
|
|
"step": 491
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.4375,
|
|
"kl": 0.0010586930438876152,
|
|
"learning_rate": 1.0092592592592593e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12408327.0,
|
|
"reward": 0.15862220525741577,
|
|
"reward_std": 0.0548260323703289,
|
|
"rewards/grpo_reward_func/mean": 0.15862220525741577,
|
|
"rewards/grpo_reward_func/std": 0.12198863923549652,
|
|
"step": 492
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.4375,
|
|
"kl": 0.0013896317104808986,
|
|
"learning_rate": 1e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12433679.0,
|
|
"reward": 0.32602596282958984,
|
|
"reward_std": 0.13098275661468506,
|
|
"rewards/grpo_reward_func/mean": 0.32602596282958984,
|
|
"rewards/grpo_reward_func/std": 0.2107134908437729,
|
|
"step": 493
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.75,
|
|
"kl": 0.0025586048141121864,
|
|
"learning_rate": 9.907407407407407e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12458655.0,
|
|
"reward": 0.3703271150588989,
|
|
"reward_std": 0.09507998824119568,
|
|
"rewards/grpo_reward_func/mean": 0.3703271150588989,
|
|
"rewards/grpo_reward_func/std": 0.14610642194747925,
|
|
"step": 494
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.0625,
|
|
"kl": 0.0011041508987545967,
|
|
"learning_rate": 9.814814814814815e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12483567.0,
|
|
"reward": 0.3221014738082886,
|
|
"reward_std": 0.06510960310697556,
|
|
"rewards/grpo_reward_func/mean": 0.3221014738082886,
|
|
"rewards/grpo_reward_func/std": 0.07591387629508972,
|
|
"step": 495
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.6875,
|
|
"kl": 0.0013404401834122837,
|
|
"learning_rate": 9.722222222222222e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12508847.0,
|
|
"reward": 0.32358843088150024,
|
|
"reward_std": 0.14844445884227753,
|
|
"rewards/grpo_reward_func/mean": 0.32358843088150024,
|
|
"rewards/grpo_reward_func/std": 0.14955370128154755,
|
|
"step": 496
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.1875,
|
|
"kl": 0.0013865028013242409,
|
|
"learning_rate": 9.629629629629629e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12534183.0,
|
|
"reward": 0.29313117265701294,
|
|
"reward_std": 0.05017620697617531,
|
|
"rewards/grpo_reward_func/mean": 0.29313117265701294,
|
|
"rewards/grpo_reward_func/std": 0.11477138102054596,
|
|
"step": 497
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5625,
|
|
"kl": 0.001455625839298591,
|
|
"learning_rate": 9.537037037037036e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12559055.0,
|
|
"reward": 0.32869526743888855,
|
|
"reward_std": 0.11172797530889511,
|
|
"rewards/grpo_reward_func/mean": 0.32869526743888855,
|
|
"rewards/grpo_reward_func/std": 0.12100888043642044,
|
|
"step": 498
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.375,
|
|
"kl": 0.000938947923714295,
|
|
"learning_rate": 9.444444444444444e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12583935.0,
|
|
"reward": 0.41716277599334717,
|
|
"reward_std": 0.1078467071056366,
|
|
"rewards/grpo_reward_func/mean": 0.41716277599334717,
|
|
"rewards/grpo_reward_func/std": 0.11082032322883606,
|
|
"step": 499
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.25,
|
|
"kl": 0.0005020884127588943,
|
|
"learning_rate": 9.351851851851851e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12609103.0,
|
|
"reward": 0.2469130903482437,
|
|
"reward_std": 0.07369641214609146,
|
|
"rewards/grpo_reward_func/mean": 0.2469130903482437,
|
|
"rewards/grpo_reward_func/std": 0.13795331120491028,
|
|
"step": 500
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.0,
|
|
"kl": 0.0005307138635544106,
|
|
"learning_rate": 9.259259259259258e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12634127.0,
|
|
"reward": 0.404737651348114,
|
|
"reward_std": 0.14841783046722412,
|
|
"rewards/grpo_reward_func/mean": 0.404737651348114,
|
|
"rewards/grpo_reward_func/std": 0.15671268105506897,
|
|
"step": 501
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.6875,
|
|
"kl": 0.000757849462388549,
|
|
"learning_rate": 9.166666666666665e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12659767.0,
|
|
"reward": 0.14257104694843292,
|
|
"reward_std": 0.07553324848413467,
|
|
"rewards/grpo_reward_func/mean": 0.14257104694843292,
|
|
"rewards/grpo_reward_func/std": 0.09304811805486679,
|
|
"step": 502
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.3125,
|
|
"kl": 0.001406958035659045,
|
|
"learning_rate": 9.074074074074074e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12685103.0,
|
|
"reward": 0.2559998333454132,
|
|
"reward_std": 0.12699110805988312,
|
|
"rewards/grpo_reward_func/mean": 0.2559998333454132,
|
|
"rewards/grpo_reward_func/std": 0.2978763282299042,
|
|
"step": 503
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.125,
|
|
"kl": 0.0007011145353317261,
|
|
"learning_rate": 8.981481481481482e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12709967.0,
|
|
"reward": 0.4325105547904968,
|
|
"reward_std": 0.07664240151643753,
|
|
"rewards/grpo_reward_func/mean": 0.4325105547904968,
|
|
"rewards/grpo_reward_func/std": 0.1956201046705246,
|
|
"step": 504
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.6875,
|
|
"kl": 0.002911916351877153,
|
|
"learning_rate": 8.888888888888888e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12734967.0,
|
|
"reward": 0.34496262669563293,
|
|
"reward_std": 0.12534165382385254,
|
|
"rewards/grpo_reward_func/mean": 0.34496262669563293,
|
|
"rewards/grpo_reward_func/std": 0.12190457433462143,
|
|
"step": 505
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.875,
|
|
"kl": 0.0007352257671300322,
|
|
"learning_rate": 8.796296296296296e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12760111.0,
|
|
"reward": 0.3680327236652374,
|
|
"reward_std": 0.07478898763656616,
|
|
"rewards/grpo_reward_func/mean": 0.3680327236652374,
|
|
"rewards/grpo_reward_func/std": 0.1823076754808426,
|
|
"step": 506
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.5625,
|
|
"kl": 0.0011474639468360692,
|
|
"learning_rate": 8.703703703703703e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12785271.0,
|
|
"reward": 0.19895240664482117,
|
|
"reward_std": 0.16426034271717072,
|
|
"rewards/grpo_reward_func/mean": 0.19895240664482117,
|
|
"rewards/grpo_reward_func/std": 0.18529564142227173,
|
|
"step": 507
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0,
|
|
"kl": 0.0006676611083094031,
|
|
"learning_rate": 8.611111111111111e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12810359.0,
|
|
"reward": 0.21774883568286896,
|
|
"reward_std": 0.11167119443416595,
|
|
"rewards/grpo_reward_func/mean": 0.21774883568286896,
|
|
"rewards/grpo_reward_func/std": 0.19753259420394897,
|
|
"step": 508
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.125,
|
|
"kl": 0.0012378571555018425,
|
|
"learning_rate": 8.518518518518517e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12835495.0,
|
|
"reward": 0.21574603021144867,
|
|
"reward_std": 0.16766542196273804,
|
|
"rewards/grpo_reward_func/mean": 0.21574603021144867,
|
|
"rewards/grpo_reward_func/std": 0.16853067278862,
|
|
"step": 509
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.25,
|
|
"kl": 0.0005606446939054877,
|
|
"learning_rate": 8.425925925925925e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12860463.0,
|
|
"reward": 0.3914608359336853,
|
|
"reward_std": 0.127981498837471,
|
|
"rewards/grpo_reward_func/mean": 0.3914608359336853,
|
|
"rewards/grpo_reward_func/std": 0.17107483744621277,
|
|
"step": 510
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 7.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.375,
|
|
"kl": 0.0006222097872523591,
|
|
"learning_rate": 8.333333333333333e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12885399.0,
|
|
"reward": 0.37283188104629517,
|
|
"reward_std": 0.1533588469028473,
|
|
"rewards/grpo_reward_func/mean": 0.37283188104629517,
|
|
"rewards/grpo_reward_func/std": 0.16130177676677704,
|
|
"step": 511
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.4375,
|
|
"kl": 0.0014068341115489602,
|
|
"learning_rate": 8.24074074074074e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 12910527.0,
|
|
"reward": 0.40698957443237305,
|
|
"reward_std": 0.09445726871490479,
|
|
"rewards/grpo_reward_func/mean": 0.40698957443237305,
|
|
"rewards/grpo_reward_func/std": 0.11586925387382507,
|
|
"step": 512
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.001016762078506872,
|
|
"learning_rate": 8.148148148148149e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12936015.0,
|
|
"reward": 0.28052544593811035,
|
|
"reward_std": 0.10798147320747375,
|
|
"rewards/grpo_reward_func/mean": 0.28052544593811035,
|
|
"rewards/grpo_reward_func/std": 0.2348252683877945,
|
|
"step": 513
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.0,
|
|
"kl": 0.0010810171370394528,
|
|
"learning_rate": 8.055555555555555e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12961375.0,
|
|
"reward": 0.4133640229701996,
|
|
"reward_std": 0.1525183767080307,
|
|
"rewards/grpo_reward_func/mean": 0.4133640229701996,
|
|
"rewards/grpo_reward_func/std": 0.1891576498746872,
|
|
"step": 514
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.4375,
|
|
"kl": 0.0010025454976130277,
|
|
"learning_rate": 7.962962962962963e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 12986327.0,
|
|
"reward": 0.43144044280052185,
|
|
"reward_std": 0.09400911629199982,
|
|
"rewards/grpo_reward_func/mean": 0.43144044280052185,
|
|
"rewards/grpo_reward_func/std": 0.15548229217529297,
|
|
"step": 515
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.0005061101837782189,
|
|
"learning_rate": 7.87037037037037e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13011303.0,
|
|
"reward": 0.42609304189682007,
|
|
"reward_std": 0.16274592280387878,
|
|
"rewards/grpo_reward_func/mean": 0.42609304189682007,
|
|
"rewards/grpo_reward_func/std": 0.18850740790367126,
|
|
"step": 516
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.875,
|
|
"kl": 0.0012816967209801078,
|
|
"learning_rate": 7.777777777777778e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13036479.0,
|
|
"reward": 0.29316410422325134,
|
|
"reward_std": 0.1588841676712036,
|
|
"rewards/grpo_reward_func/mean": 0.29316410422325134,
|
|
"rewards/grpo_reward_func/std": 0.25689324736595154,
|
|
"step": 517
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.0,
|
|
"kl": 0.0006455080583691597,
|
|
"learning_rate": 7.685185185185184e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13061303.0,
|
|
"reward": 0.3597118854522705,
|
|
"reward_std": 0.15937989950180054,
|
|
"rewards/grpo_reward_func/mean": 0.3597118854522705,
|
|
"rewards/grpo_reward_func/std": 0.2896607220172882,
|
|
"step": 518
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.75,
|
|
"kl": 0.0010962964443024248,
|
|
"learning_rate": 7.592592592592592e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13086415.0,
|
|
"reward": 0.34907934069633484,
|
|
"reward_std": 0.1042337566614151,
|
|
"rewards/grpo_reward_func/mean": 0.34907934069633484,
|
|
"rewards/grpo_reward_func/std": 0.12016221135854721,
|
|
"step": 519
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.6875,
|
|
"kl": 0.0007564701663795859,
|
|
"learning_rate": 7.5e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13112343.0,
|
|
"reward": 0.10363875329494476,
|
|
"reward_std": 0.06262201070785522,
|
|
"rewards/grpo_reward_func/mean": 0.10363875329494476,
|
|
"rewards/grpo_reward_func/std": 0.0699472650885582,
|
|
"step": 520
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.0012816584785468876,
|
|
"learning_rate": 7.407407407407407e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13137687.0,
|
|
"reward": 0.23938968777656555,
|
|
"reward_std": 0.1415923833847046,
|
|
"rewards/grpo_reward_func/mean": 0.23938968777656555,
|
|
"rewards/grpo_reward_func/std": 0.2066323161125183,
|
|
"step": 521
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.0007490174029953778,
|
|
"learning_rate": 7.314814814814814e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13162671.0,
|
|
"reward": 0.3724169135093689,
|
|
"reward_std": 0.1446247100830078,
|
|
"rewards/grpo_reward_func/mean": 0.3724169135093689,
|
|
"rewards/grpo_reward_func/std": 0.1447208970785141,
|
|
"step": 522
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.0625,
|
|
"kl": 0.0015134557033888996,
|
|
"learning_rate": 7.222222222222221e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13187423.0,
|
|
"reward": 0.3491423428058624,
|
|
"reward_std": 0.10657566785812378,
|
|
"rewards/grpo_reward_func/mean": 0.3491423428058624,
|
|
"rewards/grpo_reward_func/std": 0.12669718265533447,
|
|
"step": 523
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.75,
|
|
"kl": 0.000962267949944362,
|
|
"learning_rate": 7.12962962962963e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13212367.0,
|
|
"reward": 0.35430532693862915,
|
|
"reward_std": 0.05485477298498154,
|
|
"rewards/grpo_reward_func/mean": 0.35430532693862915,
|
|
"rewards/grpo_reward_func/std": 0.05748599022626877,
|
|
"step": 524
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.0007257629185914993,
|
|
"learning_rate": 7.037037037037038e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13237967.0,
|
|
"reward": 0.15453723073005676,
|
|
"reward_std": 0.1196342259645462,
|
|
"rewards/grpo_reward_func/mean": 0.15453723073005676,
|
|
"rewards/grpo_reward_func/std": 0.13181230425834656,
|
|
"step": 525
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.0,
|
|
"kl": 0.0010517633927520365,
|
|
"learning_rate": 6.944444444444444e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13263071.0,
|
|
"reward": 0.2398907095193863,
|
|
"reward_std": 0.08424936234951019,
|
|
"rewards/grpo_reward_func/mean": 0.2398907095193863,
|
|
"rewards/grpo_reward_func/std": 0.13464602828025818,
|
|
"step": 526
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.125,
|
|
"kl": 0.0009952950931619853,
|
|
"learning_rate": 6.851851851851852e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13288159.0,
|
|
"reward": 0.32014644145965576,
|
|
"reward_std": 0.11366228759288788,
|
|
"rewards/grpo_reward_func/mean": 0.32014644145965576,
|
|
"rewards/grpo_reward_func/std": 0.1127706915140152,
|
|
"step": 527
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.875,
|
|
"kl": 0.0003101810143562034,
|
|
"learning_rate": 6.759259259259259e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13313511.0,
|
|
"reward": 0.30525702238082886,
|
|
"reward_std": 0.1390095353126526,
|
|
"rewards/grpo_reward_func/mean": 0.30525702238082886,
|
|
"rewards/grpo_reward_func/std": 0.15475143492221832,
|
|
"step": 528
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.0013495491002686322,
|
|
"learning_rate": 6.666666666666667e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13339079.0,
|
|
"reward": 0.1646089255809784,
|
|
"reward_std": 0.06642314791679382,
|
|
"rewards/grpo_reward_func/mean": 0.1646089255809784,
|
|
"rewards/grpo_reward_func/std": 0.06487837433815002,
|
|
"step": 529
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.1875,
|
|
"kl": 0.0018139145686291158,
|
|
"learning_rate": 6.574074074074073e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13364703.0,
|
|
"reward": 0.21735727787017822,
|
|
"reward_std": 0.10561822354793549,
|
|
"rewards/grpo_reward_func/mean": 0.21735727787017822,
|
|
"rewards/grpo_reward_func/std": 0.20538462698459625,
|
|
"step": 530
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.625,
|
|
"kl": 0.0035593643551692367,
|
|
"learning_rate": 6.481481481481481e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13389807.0,
|
|
"reward": 0.2676432132720947,
|
|
"reward_std": 0.10668302327394485,
|
|
"rewards/grpo_reward_func/mean": 0.2676432132720947,
|
|
"rewards/grpo_reward_func/std": 0.15589383244514465,
|
|
"step": 531
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.001778287230990827,
|
|
"learning_rate": 6.388888888888888e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13415567.0,
|
|
"reward": 0.17674584686756134,
|
|
"reward_std": 0.09293629974126816,
|
|
"rewards/grpo_reward_func/mean": 0.17674584686756134,
|
|
"rewards/grpo_reward_func/std": 0.1887131929397583,
|
|
"step": 532
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.1875,
|
|
"kl": 0.000783468916779384,
|
|
"learning_rate": 6.296296296296296e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13440455.0,
|
|
"reward": 0.24244914948940277,
|
|
"reward_std": 0.07908381521701813,
|
|
"rewards/grpo_reward_func/mean": 0.24244914948940277,
|
|
"rewards/grpo_reward_func/std": 0.08193695545196533,
|
|
"step": 533
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.875,
|
|
"kl": 0.0005902101693209261,
|
|
"learning_rate": 6.203703703703704e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13465431.0,
|
|
"reward": 0.42329832911491394,
|
|
"reward_std": 0.09570271521806717,
|
|
"rewards/grpo_reward_func/mean": 0.42329832911491394,
|
|
"rewards/grpo_reward_func/std": 0.18416033685207367,
|
|
"step": 534
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.75,
|
|
"kl": 0.0003727572038769722,
|
|
"learning_rate": 6.111111111111111e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13490463.0,
|
|
"reward": 0.38040339946746826,
|
|
"reward_std": 0.04217088967561722,
|
|
"rewards/grpo_reward_func/mean": 0.38040339946746826,
|
|
"rewards/grpo_reward_func/std": 0.05930864065885544,
|
|
"step": 535
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.4375,
|
|
"kl": 0.001078493587556295,
|
|
"learning_rate": 6.018518518518517e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13515199.0,
|
|
"reward": 0.39346325397491455,
|
|
"reward_std": 0.05055631697177887,
|
|
"rewards/grpo_reward_func/mean": 0.39346325397491455,
|
|
"rewards/grpo_reward_func/std": 0.06339211761951447,
|
|
"step": 536
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.390625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.6875,
|
|
"kl": 0.0013342679594643414,
|
|
"learning_rate": 5.925925925925926e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13540535.0,
|
|
"reward": 0.31230786442756653,
|
|
"reward_std": 0.12040304392576218,
|
|
"rewards/grpo_reward_func/mean": 0.31230786442756653,
|
|
"rewards/grpo_reward_func/std": 0.2622143626213074,
|
|
"step": 537
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.40625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0625,
|
|
"kl": 0.002153427602024749,
|
|
"learning_rate": 5.833333333333333e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13565415.0,
|
|
"reward": 0.3166807293891907,
|
|
"reward_std": 0.10012571513652802,
|
|
"rewards/grpo_reward_func/mean": 0.3166807293891907,
|
|
"rewards/grpo_reward_func/std": 0.10676856338977814,
|
|
"step": 538
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.421875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.0625,
|
|
"kl": 0.0007572459580842406,
|
|
"learning_rate": 5.74074074074074e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13591375.0,
|
|
"reward": 0.0935957282781601,
|
|
"reward_std": 0.09053429961204529,
|
|
"rewards/grpo_reward_func/mean": 0.0935957282781601,
|
|
"rewards/grpo_reward_func/std": 0.09107687324285507,
|
|
"step": 539
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.4375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.875,
|
|
"kl": 0.0015325732820201665,
|
|
"learning_rate": 5.648148148148148e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13616223.0,
|
|
"reward": 0.40333813428878784,
|
|
"reward_std": 0.06387582421302795,
|
|
"rewards/grpo_reward_func/mean": 0.40333813428878784,
|
|
"rewards/grpo_reward_func/std": 0.15431678295135498,
|
|
"step": 540
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.453125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.5,
|
|
"kl": 0.0007481267966795713,
|
|
"learning_rate": 5.555555555555555e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13641055.0,
|
|
"reward": 0.45801687240600586,
|
|
"reward_std": 0.08247587084770203,
|
|
"rewards/grpo_reward_func/mean": 0.45801687240600586,
|
|
"rewards/grpo_reward_func/std": 0.08276832848787308,
|
|
"step": 541
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.46875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.1875,
|
|
"kl": 0.0021362415864132345,
|
|
"learning_rate": 5.462962962962963e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13665991.0,
|
|
"reward": 0.4474378824234009,
|
|
"reward_std": 0.13480228185653687,
|
|
"rewards/grpo_reward_func/mean": 0.4474378824234009,
|
|
"rewards/grpo_reward_func/std": 0.1332484483718872,
|
|
"step": 542
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.484375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.875,
|
|
"kl": 0.0008189262007363141,
|
|
"learning_rate": 5.37037037037037e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13691007.0,
|
|
"reward": 0.30340880155563354,
|
|
"reward_std": 0.15250803530216217,
|
|
"rewards/grpo_reward_func/mean": 0.30340880155563354,
|
|
"rewards/grpo_reward_func/std": 0.21679265797138214,
|
|
"step": 543
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 23.75,
|
|
"kl": 0.003065172815695405,
|
|
"learning_rate": 5.2777777777777776e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13716575.0,
|
|
"reward": 0.2417331337928772,
|
|
"reward_std": 0.14743976294994354,
|
|
"rewards/grpo_reward_func/mean": 0.2417331337928772,
|
|
"rewards/grpo_reward_func/std": 0.2199942171573639,
|
|
"step": 544
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.515625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.9375,
|
|
"kl": 0.0016453542630188167,
|
|
"learning_rate": 5.1851851851851846e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13742175.0,
|
|
"reward": 0.16955968737602234,
|
|
"reward_std": 0.1037115603685379,
|
|
"rewards/grpo_reward_func/mean": 0.16955968737602234,
|
|
"rewards/grpo_reward_func/std": 0.16303950548171997,
|
|
"step": 545
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.53125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.375,
|
|
"kl": 0.000976884097326547,
|
|
"learning_rate": 5.092592592592592e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13767983.0,
|
|
"reward": 0.16923588514328003,
|
|
"reward_std": 0.1803411841392517,
|
|
"rewards/grpo_reward_func/mean": 0.16923588514328003,
|
|
"rewards/grpo_reward_func/std": 0.19898511469364166,
|
|
"step": 546
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.546875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.5,
|
|
"kl": 0.0012220792996231467,
|
|
"learning_rate": 5e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13792751.0,
|
|
"reward": 0.40972161293029785,
|
|
"reward_std": 0.12056250870227814,
|
|
"rewards/grpo_reward_func/mean": 0.40972161293029785,
|
|
"rewards/grpo_reward_func/std": 0.11885331571102142,
|
|
"step": 547
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.5625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.6875,
|
|
"kl": 0.0010280332644470036,
|
|
"learning_rate": 4.9074074074074074e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13817871.0,
|
|
"reward": 0.4193640649318695,
|
|
"reward_std": 0.10699457675218582,
|
|
"rewards/grpo_reward_func/mean": 0.4193640649318695,
|
|
"rewards/grpo_reward_func/std": 0.10446963459253311,
|
|
"step": 548
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.578125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.6875,
|
|
"kl": 0.0008616912818979472,
|
|
"learning_rate": 4.814814814814814e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13843359.0,
|
|
"reward": 0.2062494158744812,
|
|
"reward_std": 0.1384025514125824,
|
|
"rewards/grpo_reward_func/mean": 0.2062494158744812,
|
|
"rewards/grpo_reward_func/std": 0.23651915788650513,
|
|
"step": 549
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.59375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.8125,
|
|
"kl": 0.0014753906289115548,
|
|
"learning_rate": 4.722222222222222e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13868591.0,
|
|
"reward": 0.30149251222610474,
|
|
"reward_std": 0.1228085309267044,
|
|
"rewards/grpo_reward_func/mean": 0.30149251222610474,
|
|
"rewards/grpo_reward_func/std": 0.2368825227022171,
|
|
"step": 550
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.609375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.6875,
|
|
"kl": 0.0016866040241438895,
|
|
"learning_rate": 4.629629629629629e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 13893495.0,
|
|
"reward": 0.39722514152526855,
|
|
"reward_std": 0.0929851084947586,
|
|
"rewards/grpo_reward_func/mean": 0.39722514152526855,
|
|
"rewards/grpo_reward_func/std": 0.09578373283147812,
|
|
"step": 551
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.53125,
|
|
"kl": 0.0008672124822624028,
|
|
"learning_rate": 4.537037037037037e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13919215.0,
|
|
"reward": 0.15501129627227783,
|
|
"reward_std": 0.05672089755535126,
|
|
"rewards/grpo_reward_func/mean": 0.15501129627227783,
|
|
"rewards/grpo_reward_func/std": 0.06247472018003464,
|
|
"step": 552
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.640625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.6875,
|
|
"kl": 0.0009078836592379957,
|
|
"learning_rate": 4.444444444444444e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13944183.0,
|
|
"reward": 0.29007142782211304,
|
|
"reward_std": 0.13152144849300385,
|
|
"rewards/grpo_reward_func/mean": 0.29007142782211304,
|
|
"rewards/grpo_reward_func/std": 0.14510175585746765,
|
|
"step": 553
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.65625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.34375,
|
|
"kl": 0.0005848153232363984,
|
|
"learning_rate": 4.351851851851852e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13969375.0,
|
|
"reward": 0.37974289059638977,
|
|
"reward_std": 0.09775005280971527,
|
|
"rewards/grpo_reward_func/mean": 0.37974289059638977,
|
|
"rewards/grpo_reward_func/std": 0.19289840757846832,
|
|
"step": 554
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.671875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.8125,
|
|
"kl": 0.00016175458586076275,
|
|
"learning_rate": 4.2592592592592586e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 13994311.0,
|
|
"reward": 0.41699835658073425,
|
|
"reward_std": 0.056771546602249146,
|
|
"rewards/grpo_reward_func/mean": 0.41699835658073425,
|
|
"rewards/grpo_reward_func/std": 0.1137986108660698,
|
|
"step": 555
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.6875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.5,
|
|
"kl": 0.0013509286800399423,
|
|
"learning_rate": 4.166666666666666e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14019495.0,
|
|
"reward": 0.3929343521595001,
|
|
"reward_std": 0.14318805932998657,
|
|
"rewards/grpo_reward_func/mean": 0.3929343521595001,
|
|
"rewards/grpo_reward_func/std": 0.2004500776529312,
|
|
"step": 556
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.703125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.5625,
|
|
"kl": 0.0014943527057766914,
|
|
"learning_rate": 4.0740740740740745e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14044207.0,
|
|
"reward": 0.4945647716522217,
|
|
"reward_std": 0.1458444595336914,
|
|
"rewards/grpo_reward_func/mean": 0.4945647716522217,
|
|
"rewards/grpo_reward_func/std": 0.14586947858333588,
|
|
"step": 557
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.71875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.125,
|
|
"kl": 0.0004489005805226043,
|
|
"learning_rate": 3.9814814814814815e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14069927.0,
|
|
"reward": 0.07546406239271164,
|
|
"reward_std": 0.09835518896579742,
|
|
"rewards/grpo_reward_func/mean": 0.07546406239271164,
|
|
"rewards/grpo_reward_func/std": 0.09766824543476105,
|
|
"step": 558
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.734375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.96875,
|
|
"kl": 0.0015613465220667422,
|
|
"learning_rate": 3.888888888888889e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14095207.0,
|
|
"reward": 0.42614489793777466,
|
|
"reward_std": 0.12422450631856918,
|
|
"rewards/grpo_reward_func/mean": 0.42614489793777466,
|
|
"rewards/grpo_reward_func/std": 0.17880021035671234,
|
|
"step": 559
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.5625,
|
|
"kl": 0.0030846113804727793,
|
|
"learning_rate": 3.796296296296296e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14120207.0,
|
|
"reward": 0.2813106179237366,
|
|
"reward_std": 0.12345054000616074,
|
|
"rewards/grpo_reward_func/mean": 0.2813106179237366,
|
|
"rewards/grpo_reward_func/std": 0.1639048308134079,
|
|
"step": 560
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.765625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.375,
|
|
"kl": 0.0009916585986502469,
|
|
"learning_rate": 3.7037037037037036e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14145495.0,
|
|
"reward": 0.2581431269645691,
|
|
"reward_std": 0.12751302123069763,
|
|
"rewards/grpo_reward_func/mean": 0.2581431269645691,
|
|
"rewards/grpo_reward_func/std": 0.1969458907842636,
|
|
"step": 561
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.78125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.8125,
|
|
"kl": 0.0007693757943343371,
|
|
"learning_rate": 3.6111111111111106e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14170759.0,
|
|
"reward": 0.3326881229877472,
|
|
"reward_std": 0.12007895112037659,
|
|
"rewards/grpo_reward_func/mean": 0.3326881229877472,
|
|
"rewards/grpo_reward_func/std": 0.1920982152223587,
|
|
"step": 562
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.796875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.0,
|
|
"kl": 0.0011726654774975032,
|
|
"learning_rate": 3.518518518518519e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14196303.0,
|
|
"reward": 0.08724980056285858,
|
|
"reward_std": 0.11998534202575684,
|
|
"rewards/grpo_reward_func/mean": 0.08724980056285858,
|
|
"rewards/grpo_reward_func/std": 0.1232389286160469,
|
|
"step": 563
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.8125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.75,
|
|
"kl": 0.00046126171946525574,
|
|
"learning_rate": 3.425925925925926e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14221079.0,
|
|
"reward": 0.3096466362476349,
|
|
"reward_std": 0.06632121652364731,
|
|
"rewards/grpo_reward_func/mean": 0.3096466362476349,
|
|
"rewards/grpo_reward_func/std": 0.10011015087366104,
|
|
"step": 564
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.828125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.9375,
|
|
"kl": 0.0005951582861598581,
|
|
"learning_rate": 3.3333333333333334e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14246583.0,
|
|
"reward": 0.2074100375175476,
|
|
"reward_std": 0.24280470609664917,
|
|
"rewards/grpo_reward_func/mean": 0.2074100375175476,
|
|
"rewards/grpo_reward_func/std": 0.24377292394638062,
|
|
"step": 565
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.84375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.3125,
|
|
"kl": 0.0016201141115743667,
|
|
"learning_rate": 3.2407407407407403e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14271647.0,
|
|
"reward": 0.2839413583278656,
|
|
"reward_std": 0.10143469274044037,
|
|
"rewards/grpo_reward_func/mean": 0.2839413583278656,
|
|
"rewards/grpo_reward_func/std": 0.12659397721290588,
|
|
"step": 566
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.859375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.375,
|
|
"kl": 0.0010330639779567719,
|
|
"learning_rate": 3.148148148148148e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14296583.0,
|
|
"reward": 0.322548508644104,
|
|
"reward_std": 0.09603136777877808,
|
|
"rewards/grpo_reward_func/mean": 0.322548508644104,
|
|
"rewards/grpo_reward_func/std": 0.09745854139328003,
|
|
"step": 567
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.75,
|
|
"kl": 0.00181583222001791,
|
|
"learning_rate": 3.0555555555555556e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14321959.0,
|
|
"reward": 0.20520664751529694,
|
|
"reward_std": 0.10753442347049713,
|
|
"rewards/grpo_reward_func/mean": 0.20520664751529694,
|
|
"rewards/grpo_reward_func/std": 0.21462036669254303,
|
|
"step": 568
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.890625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.5,
|
|
"kl": 0.002053163305390626,
|
|
"learning_rate": 2.962962962962963e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14347727.0,
|
|
"reward": 0.07180184125900269,
|
|
"reward_std": 0.12105913460254669,
|
|
"rewards/grpo_reward_func/mean": 0.07180184125900269,
|
|
"rewards/grpo_reward_func/std": 0.11819092184305191,
|
|
"step": 569
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.90625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.0625,
|
|
"kl": 0.0011411278101149946,
|
|
"learning_rate": 2.87037037037037e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14372879.0,
|
|
"reward": 0.34137165546417236,
|
|
"reward_std": 0.13372355699539185,
|
|
"rewards/grpo_reward_func/mean": 0.34137165546417236,
|
|
"rewards/grpo_reward_func/std": 0.15903595089912415,
|
|
"step": 570
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.921875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.8125,
|
|
"kl": 0.0012179824407212436,
|
|
"learning_rate": 2.7777777777777774e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14397903.0,
|
|
"reward": 0.32042965292930603,
|
|
"reward_std": 0.10691528767347336,
|
|
"rewards/grpo_reward_func/mean": 0.32042965292930603,
|
|
"rewards/grpo_reward_func/std": 0.13944755494594574,
|
|
"step": 571
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.9375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.9375,
|
|
"kl": 0.0027775077614933252,
|
|
"learning_rate": 2.685185185185185e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14422735.0,
|
|
"reward": 0.41792985796928406,
|
|
"reward_std": 0.18040655553340912,
|
|
"rewards/grpo_reward_func/mean": 0.41792985796928406,
|
|
"rewards/grpo_reward_func/std": 0.1744299978017807,
|
|
"step": 572
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.953125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.375,
|
|
"kl": 0.0006387926114257425,
|
|
"learning_rate": 2.5925925925925923e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14448311.0,
|
|
"reward": 0.20609521865844727,
|
|
"reward_std": 0.05411346256732941,
|
|
"rewards/grpo_reward_func/mean": 0.20609521865844727,
|
|
"rewards/grpo_reward_func/std": 0.19829832017421722,
|
|
"step": 573
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.96875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.0,
|
|
"kl": 0.0012447184999473393,
|
|
"learning_rate": 2.5e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14474111.0,
|
|
"reward": 0.10689907521009445,
|
|
"reward_std": 0.11656990647315979,
|
|
"rewards/grpo_reward_func/mean": 0.10689907521009445,
|
|
"rewards/grpo_reward_func/std": 0.12091051787137985,
|
|
"step": 574
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 8.984375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5625,
|
|
"kl": 0.0006491482927231118,
|
|
"learning_rate": 2.407407407407407e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14498855.0,
|
|
"reward": 0.34690797328948975,
|
|
"reward_std": 0.08506066352128983,
|
|
"rewards/grpo_reward_func/mean": 0.34690797328948975,
|
|
"rewards/grpo_reward_func/std": 0.0848422721028328,
|
|
"step": 575
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.1875,
|
|
"kl": 0.0013912487775087357,
|
|
"learning_rate": 2.3148148148148144e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14524343.0,
|
|
"reward": 0.2778702676296234,
|
|
"reward_std": 0.08481252193450928,
|
|
"rewards/grpo_reward_func/mean": 0.2778702676296234,
|
|
"rewards/grpo_reward_func/std": 0.25761204957962036,
|
|
"step": 576
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.4375,
|
|
"kl": 0.0012237662449479103,
|
|
"learning_rate": 2.222222222222222e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14549311.0,
|
|
"reward": 0.4252242147922516,
|
|
"reward_std": 0.15082389116287231,
|
|
"rewards/grpo_reward_func/mean": 0.4252242147922516,
|
|
"rewards/grpo_reward_func/std": 0.19061405956745148,
|
|
"step": 577
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.75,
|
|
"kl": 0.001242486119735986,
|
|
"learning_rate": 2.1296296296296293e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14574871.0,
|
|
"reward": 0.1958128809928894,
|
|
"reward_std": 0.11874909698963165,
|
|
"rewards/grpo_reward_func/mean": 0.1958128809928894,
|
|
"rewards/grpo_reward_func/std": 0.19743791222572327,
|
|
"step": 578
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.046875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.5,
|
|
"kl": 0.0009185175294987857,
|
|
"learning_rate": 2.0370370370370373e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14599783.0,
|
|
"reward": 0.34701311588287354,
|
|
"reward_std": 0.10938475281000137,
|
|
"rewards/grpo_reward_func/mean": 0.34701311588287354,
|
|
"rewards/grpo_reward_func/std": 0.13787339627742767,
|
|
"step": 579
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.0625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.625,
|
|
"kl": 0.0007825860229786485,
|
|
"learning_rate": 1.9444444444444445e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14624775.0,
|
|
"reward": 0.3385382890701294,
|
|
"reward_std": 0.12184540182352066,
|
|
"rewards/grpo_reward_func/mean": 0.3385382890701294,
|
|
"rewards/grpo_reward_func/std": 0.12013304233551025,
|
|
"step": 580
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.078125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.0,
|
|
"kl": 0.0004937490448355675,
|
|
"learning_rate": 1.8518518518518518e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14649759.0,
|
|
"reward": 0.3188565969467163,
|
|
"reward_std": 0.0923345610499382,
|
|
"rewards/grpo_reward_func/mean": 0.3188565969467163,
|
|
"rewards/grpo_reward_func/std": 0.1829162985086441,
|
|
"step": 581
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.09375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.75,
|
|
"kl": 0.0009138956665992737,
|
|
"learning_rate": 1.7592592592592594e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14675047.0,
|
|
"reward": 0.2307702898979187,
|
|
"reward_std": 0.08643310517072678,
|
|
"rewards/grpo_reward_func/mean": 0.2307702898979187,
|
|
"rewards/grpo_reward_func/std": 0.11515053361654282,
|
|
"step": 582
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.109375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 21.125,
|
|
"kl": 0.0005873590707778931,
|
|
"learning_rate": 1.6666666666666667e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14699943.0,
|
|
"reward": 0.39582157135009766,
|
|
"reward_std": 0.12089787423610687,
|
|
"rewards/grpo_reward_func/mean": 0.39582157135009766,
|
|
"rewards/grpo_reward_func/std": 0.1251089870929718,
|
|
"step": 583
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.125,
|
|
"kl": 0.00180981180164963,
|
|
"learning_rate": 1.574074074074074e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14724911.0,
|
|
"reward": 0.42574357986450195,
|
|
"reward_std": 0.12509074807167053,
|
|
"rewards/grpo_reward_func/mean": 0.42574357986450195,
|
|
"rewards/grpo_reward_func/std": 0.13018791377544403,
|
|
"step": 584
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.140625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.6875,
|
|
"kl": 0.0008814459142740816,
|
|
"learning_rate": 1.4814814814814814e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14750199.0,
|
|
"reward": 0.21241606771945953,
|
|
"reward_std": 0.0747273787856102,
|
|
"rewards/grpo_reward_func/mean": 0.21241606771945953,
|
|
"rewards/grpo_reward_func/std": 0.1790691465139389,
|
|
"step": 585
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.15625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.3125,
|
|
"kl": 0.0019009755342267454,
|
|
"learning_rate": 1.3888888888888887e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14775255.0,
|
|
"reward": 0.43149012327194214,
|
|
"reward_std": 0.11503390967845917,
|
|
"rewards/grpo_reward_func/mean": 0.43149012327194214,
|
|
"rewards/grpo_reward_func/std": 0.14789779484272003,
|
|
"step": 586
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.171875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.5,
|
|
"kl": 0.0011376099428161979,
|
|
"learning_rate": 1.2962962962962961e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14800007.0,
|
|
"reward": 0.42513400316238403,
|
|
"reward_std": 0.10579125583171844,
|
|
"rewards/grpo_reward_func/mean": 0.42513400316238403,
|
|
"rewards/grpo_reward_func/std": 0.10767898708581924,
|
|
"step": 587
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.1875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.4375,
|
|
"kl": 0.0003657530469354242,
|
|
"learning_rate": 1.2037037037037036e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14825775.0,
|
|
"reward": 0.16337397694587708,
|
|
"reward_std": 0.1274218112230301,
|
|
"rewards/grpo_reward_func/mean": 0.16337397694587708,
|
|
"rewards/grpo_reward_func/std": 0.15495631098747253,
|
|
"step": 588
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.203125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.375,
|
|
"kl": 0.0011066117731388658,
|
|
"learning_rate": 1.111111111111111e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14850807.0,
|
|
"reward": 0.3153996467590332,
|
|
"reward_std": 0.12657006084918976,
|
|
"rewards/grpo_reward_func/mean": 0.3153996467590332,
|
|
"rewards/grpo_reward_func/std": 0.17775346338748932,
|
|
"step": 589
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.21875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.375,
|
|
"kl": 0.0005893185880267993,
|
|
"learning_rate": 1.0185185185185186e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 14876407.0,
|
|
"reward": 0.07492673397064209,
|
|
"reward_std": 0.09368358552455902,
|
|
"rewards/grpo_reward_func/mean": 0.07492673397064209,
|
|
"rewards/grpo_reward_func/std": 0.09934189170598984,
|
|
"step": 590
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.234375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.8125,
|
|
"kl": 0.0015896050026640296,
|
|
"learning_rate": 9.259259259259259e-09,
|
|
"loss": 0.0001,
|
|
"num_tokens": 14901791.0,
|
|
"reward": 0.21189236640930176,
|
|
"reward_std": 0.20999035239219666,
|
|
"rewards/grpo_reward_func/mean": 0.21189236640930176,
|
|
"rewards/grpo_reward_func/std": 0.22601766884326935,
|
|
"step": 591
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.5,
|
|
"kl": 0.0007123357499949634,
|
|
"learning_rate": 8.333333333333334e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 14926863.0,
|
|
"reward": 0.4173913598060608,
|
|
"reward_std": 0.10256533324718475,
|
|
"rewards/grpo_reward_func/mean": 0.4173913598060608,
|
|
"rewards/grpo_reward_func/std": 0.11102120578289032,
|
|
"step": 592
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.265625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.9375,
|
|
"kl": 0.0007044219819363207,
|
|
"learning_rate": 7.407407407407407e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 14952511.0,
|
|
"reward": 0.20125332474708557,
|
|
"reward_std": 0.11327949911355972,
|
|
"rewards/grpo_reward_func/mean": 0.20125332474708557,
|
|
"rewards/grpo_reward_func/std": 0.2046782523393631,
|
|
"step": 593
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.28125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.8125,
|
|
"kl": 0.000963706843322143,
|
|
"learning_rate": 6.481481481481481e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 14977439.0,
|
|
"reward": 0.41489923000335693,
|
|
"reward_std": 0.1567956805229187,
|
|
"rewards/grpo_reward_func/mean": 0.41489923000335693,
|
|
"rewards/grpo_reward_func/std": 0.15223918855190277,
|
|
"step": 594
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.296875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.125,
|
|
"kl": 0.0006893044337630272,
|
|
"learning_rate": 5.555555555555555e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 15002111.0,
|
|
"reward": 0.601151704788208,
|
|
"reward_std": 0.12341496348381042,
|
|
"rewards/grpo_reward_func/mean": 0.601151704788208,
|
|
"rewards/grpo_reward_func/std": 0.1384855955839157,
|
|
"step": 595
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.3125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.875,
|
|
"kl": 0.001145336776971817,
|
|
"learning_rate": 4.6296296296296295e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 15027239.0,
|
|
"reward": 0.19861392676830292,
|
|
"reward_std": 0.1082070991396904,
|
|
"rewards/grpo_reward_func/mean": 0.19861392676830292,
|
|
"rewards/grpo_reward_func/std": 0.15457463264465332,
|
|
"step": 596
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.328125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.5625,
|
|
"kl": 0.0013100424548611045,
|
|
"learning_rate": 3.7037037037037036e-09,
|
|
"loss": 0.0001,
|
|
"num_tokens": 15052759.0,
|
|
"reward": 0.15435832738876343,
|
|
"reward_std": 0.17842328548431396,
|
|
"rewards/grpo_reward_func/mean": 0.15435832738876343,
|
|
"rewards/grpo_reward_func/std": 0.17921017110347748,
|
|
"step": 597
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.34375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.9375,
|
|
"kl": 0.0014240065356716514,
|
|
"learning_rate": 2.7777777777777776e-09,
|
|
"loss": 0.0001,
|
|
"num_tokens": 15078943.0,
|
|
"reward": 0.0750146359205246,
|
|
"reward_std": 0.1408962607383728,
|
|
"rewards/grpo_reward_func/mean": 0.0750146359205246,
|
|
"rewards/grpo_reward_func/std": 0.14366577565670013,
|
|
"step": 598
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.359375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.75,
|
|
"kl": 0.0006565783696714789,
|
|
"learning_rate": 1.8518518518518518e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 15104375.0,
|
|
"reward": 0.21018442511558533,
|
|
"reward_std": 0.062313079833984375,
|
|
"rewards/grpo_reward_func/mean": 0.21018442511558533,
|
|
"rewards/grpo_reward_func/std": 0.10198579728603363,
|
|
"step": 599
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 9.375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.375,
|
|
"kl": 0.001865773752797395,
|
|
"learning_rate": 9.259259259259259e-10,
|
|
"loss": 0.0001,
|
|
"num_tokens": 15129007.0,
|
|
"reward": 0.5128255486488342,
|
|
"reward_std": 0.1550142616033554,
|
|
"rewards/grpo_reward_func/mean": 0.5128255486488342,
|
|
"rewards/grpo_reward_func/std": 0.1619613915681839,
|
|
"step": 600
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 600,
|
|
"num_input_tokens_seen": 15129007,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 600,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|