5835 lines
209 KiB
JSON
5835 lines
209 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.14285714285714285,
|
|
"eval_steps": 500,
|
|
"global_step": 200,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1221.0,
|
|
"completions/max_terminated_length": 1221.0,
|
|
"completions/mean_length": 429.265625,
|
|
"completions/mean_terminated_length": 429.265625,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.16631191410124302,
|
|
"epoch": 0.0007142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.064172744750977,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.0,
|
|
"num_tokens": 106993.0,
|
|
"reward": 0.3062499761581421,
|
|
"reward_std": 0.44636982679367065,
|
|
"rewards/format_reward/mean": 0.40625,
|
|
"rewards/format_reward/std": 0.279951810836792,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 1,
|
|
"step_time": 58.5344306009938
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1353.0,
|
|
"completions/max_terminated_length": 1353.0,
|
|
"completions/mean_length": 318.9375,
|
|
"completions/mean_terminated_length": 318.9375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.21383497677743435,
|
|
"epoch": 0.0014285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.98560905456543,
|
|
"learning_rate": 5.555555555555555e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 215429.0,
|
|
"reward": 0.1718749850988388,
|
|
"reward_std": 0.3545480966567993,
|
|
"rewards/format_reward/mean": 0.3125,
|
|
"rewards/format_reward/std": 0.2745848298072815,
|
|
"rewards/mcq_exact_match_reward/mean": 0.140625,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 2,
|
|
"step_time": 88.76631601905683
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1401.0,
|
|
"completions/mean_length": 473.1875,
|
|
"completions/mean_terminated_length": 448.19049072265625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18973157368600368,
|
|
"epoch": 0.002142857142857143,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 4.975986957550049,
|
|
"learning_rate": 1.111111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 328561.0,
|
|
"reward": 0.26093751192092896,
|
|
"reward_std": 0.421775221824646,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.2847827076911926,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 3,
|
|
"step_time": 165.18509875505697
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1586.0,
|
|
"completions/mean_length": 572.875,
|
|
"completions/mean_terminated_length": 525.290283203125,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.16460688412189484,
|
|
"epoch": 0.002857142857142857,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.808328628540039,
|
|
"learning_rate": 1.6666666666666665e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 457417.0,
|
|
"reward": 0.3304687440395355,
|
|
"reward_std": 0.4703609347343445,
|
|
"rewards/format_reward/mean": 0.3359375,
|
|
"rewards/format_reward/std": 0.29620200395584106,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 4,
|
|
"step_time": 144.26409805112053
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1520.0,
|
|
"completions/max_terminated_length": 1520.0,
|
|
"completions/mean_length": 487.84375,
|
|
"completions/mean_terminated_length": 487.84375,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.1548436339944601,
|
|
"epoch": 0.0035714285714285713,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.682651996612549,
|
|
"learning_rate": 2.222222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 581367.0,
|
|
"reward": 0.26640623807907104,
|
|
"reward_std": 0.43771177530288696,
|
|
"rewards/format_reward/mean": 0.3203125,
|
|
"rewards/format_reward/std": 0.27265870571136475,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 5,
|
|
"step_time": 108.35025227611186
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1734.0,
|
|
"completions/max_terminated_length": 1734.0,
|
|
"completions/mean_length": 504.1875,
|
|
"completions/mean_terminated_length": 504.1875,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.15498177334666252,
|
|
"epoch": 0.004285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 19.60171890258789,
|
|
"learning_rate": 2.7777777777777776e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 697603.0,
|
|
"reward": 0.15312498807907104,
|
|
"reward_std": 0.3424786627292633,
|
|
"rewards/format_reward/mean": 0.28125,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 6,
|
|
"step_time": 101.33141891699051
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1490.0,
|
|
"completions/max_terminated_length": 1490.0,
|
|
"completions/mean_length": 490.046875,
|
|
"completions/mean_terminated_length": 490.046875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.16023985855281353,
|
|
"epoch": 0.005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.83193302154541,
|
|
"learning_rate": 3.333333333333333e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 815102.0,
|
|
"reward": 0.21953123807907104,
|
|
"reward_std": 0.4048923850059509,
|
|
"rewards/format_reward/mean": 0.3203125,
|
|
"rewards/format_reward/std": 0.27265870571136475,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 7,
|
|
"step_time": 91.50709407392424
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 898.0,
|
|
"completions/max_terminated_length": 898.0,
|
|
"completions/mean_length": 304.890625,
|
|
"completions/mean_terminated_length": 304.890625,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"entropy": 0.24000362865626812,
|
|
"epoch": 0.005714285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5482072830200195,
|
|
"learning_rate": 3.888888888888889e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 898271.0,
|
|
"reward": 0.37343746423721313,
|
|
"reward_std": 0.4891658127307892,
|
|
"rewards/format_reward/mean": 0.296875,
|
|
"rewards/format_reward/std": 0.2630521357059479,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 8,
|
|
"step_time": 39.65540377004072
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1663.0,
|
|
"completions/mean_length": 562.890625,
|
|
"completions/mean_terminated_length": 539.3175048828125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.16113737598061562,
|
|
"epoch": 0.0064285714285714285,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 4.5696187019348145,
|
|
"learning_rate": 4.444444444444444e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1037680.0,
|
|
"reward": 0.3976562023162842,
|
|
"reward_std": 0.48798495531082153,
|
|
"rewards/format_reward/mean": 0.3828125,
|
|
"rewards/format_reward/std": 0.21347814798355103,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 9,
|
|
"step_time": 141.6887187999091
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1540.0,
|
|
"completions/mean_length": 482.828125,
|
|
"completions/mean_terminated_length": 457.9841613769531,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.13189083151519299,
|
|
"epoch": 0.007142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.855482578277588,
|
|
"learning_rate": 5e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1177501.0,
|
|
"reward": 0.2890625,
|
|
"reward_std": 0.44351306557655334,
|
|
"rewards/format_reward/mean": 0.390625,
|
|
"rewards/format_reward/std": 0.2592533528804779,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 10,
|
|
"step_time": 166.28699472307926
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1330.0,
|
|
"completions/max_terminated_length": 1330.0,
|
|
"completions/mean_length": 475.71875,
|
|
"completions/mean_terminated_length": 475.71875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1734474077820778,
|
|
"epoch": 0.007857142857142858,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.398857593536377,
|
|
"learning_rate": 5.555555555555555e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1285235.0,
|
|
"reward": 0.3296874761581421,
|
|
"reward_std": 0.46408456563949585,
|
|
"rewards/format_reward/mean": 0.328125,
|
|
"rewards/format_reward/std": 0.29839184880256653,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 11,
|
|
"step_time": 84.22253790113609
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1220.0,
|
|
"completions/mean_length": 472.0,
|
|
"completions/mean_terminated_length": 421.1612854003906,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.1755824889987707,
|
|
"epoch": 0.008571428571428572,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.043524265289307,
|
|
"learning_rate": 6.111111111111112e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1402707.0,
|
|
"reward": 0.3609374761581421,
|
|
"reward_std": 0.4861958622932434,
|
|
"rewards/format_reward/mean": 0.328125,
|
|
"rewards/format_reward/std": 0.31140682101249695,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 12,
|
|
"step_time": 147.3180411880021
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1790.0,
|
|
"completions/max_terminated_length": 1790.0,
|
|
"completions/mean_length": 390.375,
|
|
"completions/mean_terminated_length": 390.375,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"entropy": 0.18040168471634388,
|
|
"epoch": 0.009285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.297653198242188,
|
|
"learning_rate": 6.666666666666666e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1499635.0,
|
|
"reward": 0.34296876192092896,
|
|
"reward_std": 0.4763341546058655,
|
|
"rewards/format_reward/mean": 0.4609375,
|
|
"rewards/format_reward/std": 0.37059250473976135,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 13,
|
|
"step_time": 114.5199738269439
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1307.0,
|
|
"completions/mean_length": 537.03125,
|
|
"completions/mean_terminated_length": 488.2903137207031,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.17947025410830975,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.441692352294922,
|
|
"learning_rate": 7.222222222222221e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1617149.0,
|
|
"reward": 0.1929687261581421,
|
|
"reward_std": 0.3684875965118408,
|
|
"rewards/format_reward/mean": 0.3671875,
|
|
"rewards/format_reward/std": 0.28510910272598267,
|
|
"rewards/mcq_exact_match_reward/mean": 0.15625,
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
|
"step": 14,
|
|
"step_time": 151.2594413299812
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1614.0,
|
|
"completions/max_terminated_length": 1614.0,
|
|
"completions/mean_length": 591.671875,
|
|
"completions/mean_terminated_length": 591.671875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.14635740965604782,
|
|
"epoch": 0.010714285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.28996467590332,
|
|
"learning_rate": 7.777777777777778e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1737472.0,
|
|
"reward": 0.43437498807907104,
|
|
"reward_std": 0.49728426337242126,
|
|
"rewards/format_reward/mean": 0.4375,
|
|
"rewards/format_reward/std": 0.22712838649749756,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 15,
|
|
"step_time": 89.8077824919601
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1520.0,
|
|
"completions/max_terminated_length": 1520.0,
|
|
"completions/mean_length": 394.375,
|
|
"completions/mean_terminated_length": 394.375,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.14997188560664654,
|
|
"epoch": 0.011428571428571429,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.825170516967773,
|
|
"learning_rate": 8.333333333333333e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1841536.0,
|
|
"reward": 0.44218748807907104,
|
|
"reward_std": 0.5008895993232727,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.24346621334552765,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 16,
|
|
"step_time": 76.37220047204755
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1665.0,
|
|
"completions/mean_length": 405.4375,
|
|
"completions/mean_terminated_length": 379.3651123046875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1900953222066164,
|
|
"epoch": 0.012142857142857143,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 4.9255475997924805,
|
|
"learning_rate": 8.888888888888888e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1953596.0,
|
|
"reward": 0.29296875,
|
|
"reward_std": 0.444888710975647,
|
|
"rewards/format_reward/mean": 0.4296875,
|
|
"rewards/format_reward/std": 0.26528194546699524,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 17,
|
|
"step_time": 201.1936074459809
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1469.0,
|
|
"completions/max_terminated_length": 1469.0,
|
|
"completions/mean_length": 503.375,
|
|
"completions/mean_terminated_length": 503.375,
|
|
"completions/min_length": 139.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"entropy": 0.16892211325466633,
|
|
"epoch": 0.012857142857142857,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.7092769145965576,
|
|
"learning_rate": 9.444444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2075684.0,
|
|
"reward": 0.39374998211860657,
|
|
"reward_std": 0.4857901334762573,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 18,
|
|
"step_time": 85.88162007700885
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1472.0,
|
|
"completions/max_terminated_length": 1472.0,
|
|
"completions/mean_length": 405.21875,
|
|
"completions/mean_terminated_length": 405.21875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.14752909820526838,
|
|
"epoch": 0.013571428571428571,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.623274803161621,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 2192466.0,
|
|
"reward": 0.3187499940395355,
|
|
"reward_std": 0.45114490389823914,
|
|
"rewards/format_reward/mean": 0.375,
|
|
"rewards/format_reward/std": 0.2357022762298584,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 19,
|
|
"step_time": 92.52301802794682
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1522.0,
|
|
"completions/mean_length": 510.96875,
|
|
"completions/mean_terminated_length": 461.3870849609375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.19505906477570534,
|
|
"epoch": 0.014285714285714285,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.067420482635498,
|
|
"learning_rate": 9.999776148326214e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2310224.0,
|
|
"reward": 0.17656250298023224,
|
|
"reward_std": 0.33117976784706116,
|
|
"rewards/format_reward/mean": 0.515625,
|
|
"rewards/format_reward/std": 0.23517554998397827,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 20,
|
|
"step_time": 171.9173401860171
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1016.0,
|
|
"completions/max_terminated_length": 1016.0,
|
|
"completions/mean_length": 405.171875,
|
|
"completions/mean_terminated_length": 405.171875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.20128028839826584,
|
|
"epoch": 0.015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.80737590789795,
|
|
"learning_rate": 9.999104613348689e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2409627.0,
|
|
"reward": 0.32343748211860657,
|
|
"reward_std": 0.4627358913421631,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.33592742681503296,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 21,
|
|
"step_time": 53.99907385505503
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1459.0,
|
|
"completions/mean_length": 482.375,
|
|
"completions/mean_terminated_length": 457.5238342285156,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.16458906698971987,
|
|
"epoch": 0.015714285714285715,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 4.145486831665039,
|
|
"learning_rate": 9.997985455197113e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2518611.0,
|
|
"reward": 0.35078126192092896,
|
|
"reward_std": 0.46423619985580444,
|
|
"rewards/format_reward/mean": 0.5390625,
|
|
"rewards/format_reward/std": 0.3249503970146179,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 22,
|
|
"step_time": 158.2987147619715
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 971.0,
|
|
"completions/max_terminated_length": 971.0,
|
|
"completions/mean_length": 269.3125,
|
|
"completions/mean_terminated_length": 269.3125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.21560950204730034,
|
|
"epoch": 0.016428571428571428,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 27.24405288696289,
|
|
"learning_rate": 9.996418774081656e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2592975.0,
|
|
"reward": 0.41484373807907104,
|
|
"reward_std": 0.4857231378555298,
|
|
"rewards/format_reward/mean": 0.7109375,
|
|
"rewards/format_reward/std": 0.2928335666656494,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 23,
|
|
"step_time": 46.8515038289479
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1421.0,
|
|
"completions/max_terminated_length": 1421.0,
|
|
"completions/mean_length": 252.953125,
|
|
"completions/mean_terminated_length": 252.953125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.19199753180146217,
|
|
"epoch": 0.017142857142857144,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.405986785888672,
|
|
"learning_rate": 9.994404710283998e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2687908.0,
|
|
"reward": 0.3343749940395355,
|
|
"reward_std": 0.4513537883758545,
|
|
"rewards/format_reward/mean": 0.6875,
|
|
"rewards/format_reward/std": 0.3726780116558075,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 24,
|
|
"step_time": 82.87710704799974
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1449.0,
|
|
"completions/max_terminated_length": 1449.0,
|
|
"completions/mean_length": 164.6875,
|
|
"completions/mean_terminated_length": 164.6875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2559709567576647,
|
|
"epoch": 0.017857142857142856,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 8.622386932373047,
|
|
"learning_rate": 9.991943444144756e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2777256.0,
|
|
"reward": 0.29296875,
|
|
"reward_std": 0.4255591630935669,
|
|
"rewards/format_reward/mean": 0.7421875,
|
|
"rewards/format_reward/std": 0.2816080152988434,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 25,
|
|
"step_time": 100.27635278215166
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1396.0,
|
|
"completions/max_terminated_length": 1396.0,
|
|
"completions/mean_length": 82.921875,
|
|
"completions/mean_terminated_length": 82.921875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.24555067718029022,
|
|
"epoch": 0.018571428571428572,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 17.675294876098633,
|
|
"learning_rate": 9.989035196047348e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2852203.0,
|
|
"reward": 0.4906250238418579,
|
|
"reward_std": 0.49432292580604553,
|
|
"rewards/format_reward/mean": 0.84375,
|
|
"rewards/format_reward/std": 0.265398770570755,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 26,
|
|
"step_time": 71.82276537799044
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 536.0,
|
|
"completions/max_terminated_length": 536.0,
|
|
"completions/mean_length": 41.0625,
|
|
"completions/mean_terminated_length": 41.0625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.23947478830814362,
|
|
"epoch": 0.019285714285714285,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 27.987302780151367,
|
|
"learning_rate": 9.98568022639826e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2928439.0,
|
|
"reward": 0.63671875,
|
|
"reward_std": 0.5023154616355896,
|
|
"rewards/format_reward/mean": 0.8984375,
|
|
"rewards/format_reward/std": 0.20275264978408813,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 27,
|
|
"step_time": 26.626900972041767
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 691.0,
|
|
"completions/max_terminated_length": 691.0,
|
|
"completions/mean_length": 58.015625,
|
|
"completions/mean_terminated_length": 58.015625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18591530248522758,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 33.29207992553711,
|
|
"learning_rate": 9.981878835603716e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2998840.0,
|
|
"reward": 0.4515625238418579,
|
|
"reward_std": 0.48989540338516235,
|
|
"rewards/format_reward/mean": 0.921875,
|
|
"rewards/format_reward/std": 0.20351573824882507,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 28,
|
|
"step_time": 29.762270515959244
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 574.0,
|
|
"completions/max_terminated_length": 574.0,
|
|
"completions/mean_length": 34.6875,
|
|
"completions/mean_terminated_length": 34.6875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.15785411559045315,
|
|
"epoch": 0.020714285714285713,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 21.5506534576416,
|
|
"learning_rate": 9.977631364042794e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3069804.0,
|
|
"reward": 0.55078125,
|
|
"reward_std": 0.502493143081665,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 29,
|
|
"step_time": 27.969350750092417
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 497.0,
|
|
"completions/max_terminated_length": 497.0,
|
|
"completions/mean_length": 26.53125,
|
|
"completions/mean_terminated_length": 26.53125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1635878887027502,
|
|
"epoch": 0.02142857142857143,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 17.759307861328125,
|
|
"learning_rate": 9.972938192036944e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3150486.0,
|
|
"reward": 0.653124988079071,
|
|
"reward_std": 0.49359992146492004,
|
|
"rewards/format_reward/mean": 0.90625,
|
|
"rewards/format_reward/std": 0.19669894874095917,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 30,
|
|
"step_time": 32.165516318927985
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 169.0,
|
|
"completions/max_terminated_length": 169.0,
|
|
"completions/mean_length": 16.28125,
|
|
"completions/mean_terminated_length": 16.28125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.11400723084807396,
|
|
"epoch": 0.02214285714285714,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 7.3703789710998535,
|
|
"learning_rate": 9.967799739815924e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3208048.0,
|
|
"reward": 0.4437500238418579,
|
|
"reward_std": 0.4787135720252991,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 31,
|
|
"step_time": 6.480815099028405
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 202.0,
|
|
"completions/max_terminated_length": 202.0,
|
|
"completions/mean_length": 19.875,
|
|
"completions/mean_terminated_length": 19.875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.23133844323456287,
|
|
"epoch": 0.022857142857142857,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 14.804015159606934,
|
|
"learning_rate": 9.96221646748019e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3307336.0,
|
|
"reward": 0.21484375,
|
|
"reward_std": 0.33814147114753723,
|
|
"rewards/format_reward/mean": 0.8984375,
|
|
"rewards/format_reward/std": 0.25479042530059814,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 32,
|
|
"step_time": 15.765849456947763
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 12.140625,
|
|
"completions/mean_terminated_length": 12.140625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.11367706023156643,
|
|
"epoch": 0.023571428571428573,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 25.840938568115234,
|
|
"learning_rate": 9.956188874959686e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3388185.0,
|
|
"reward": 0.36328125,
|
|
"reward_std": 0.44670239090919495,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.10652101784944534,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 33,
|
|
"step_time": 3.9221740990760736
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 97.0,
|
|
"completions/max_terminated_length": 97.0,
|
|
"completions/mean_length": 14.109375,
|
|
"completions/mean_terminated_length": 14.109375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.12458794936537743,
|
|
"epoch": 0.024285714285714285,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 14.921568870544434,
|
|
"learning_rate": 9.949717501969079e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3454872.0,
|
|
"reward": 0.5679687857627869,
|
|
"reward_std": 0.5037452578544617,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 34,
|
|
"step_time": 5.632602730009239
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 147.0,
|
|
"completions/max_terminated_length": 147.0,
|
|
"completions/mean_length": 15.34375,
|
|
"completions/mean_terminated_length": 15.34375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.11258962377905846,
|
|
"epoch": 0.025,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 6.26678466796875,
|
|
"learning_rate": 9.942802927959442e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3534958.0,
|
|
"reward": 0.5375000238418579,
|
|
"reward_std": 0.5,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 35,
|
|
"step_time": 9.37483271205565
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.328125,
|
|
"completions/mean_terminated_length": 12.328125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.13397582434117794,
|
|
"epoch": 0.025714285714285714,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 6.364090442657471,
|
|
"learning_rate": 9.93544577206636e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3597171.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 36,
|
|
"step_time": 3.0314099779934622
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 51.0,
|
|
"completions/max_terminated_length": 51.0,
|
|
"completions/mean_length": 13.078125,
|
|
"completions/mean_terminated_length": 13.078125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.11545340903103352,
|
|
"epoch": 0.02642857142857143,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 11.611798286437988,
|
|
"learning_rate": 9.927646693054495e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3696696.0,
|
|
"reward": 0.3656250238418579,
|
|
"reward_std": 0.44515693187713623,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 37,
|
|
"step_time": 8.566134120046627
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.484375,
|
|
"completions/mean_terminated_length": 12.484375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.10600519925355911,
|
|
"epoch": 0.027142857142857142,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 7.415499210357666,
|
|
"learning_rate": 9.919406389258606e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3774983.0,
|
|
"reward": 0.4437500238418579,
|
|
"reward_std": 0.4787135720252991,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 38,
|
|
"step_time": 3.5948679719585925
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.390625,
|
|
"completions/mean_terminated_length": 12.390625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.11269045062363148,
|
|
"epoch": 0.027857142857142858,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 7.707824230194092,
|
|
"learning_rate": 9.910725598521012e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3879416.0,
|
|
"reward": 0.7093750238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 39,
|
|
"step_time": 6.437070619082078
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 65.0,
|
|
"completions/max_terminated_length": 65.0,
|
|
"completions/mean_length": 14.71875,
|
|
"completions/mean_terminated_length": 14.71875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.1085408478975296,
|
|
"epoch": 0.02857142857142857,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 11.853015899658203,
|
|
"learning_rate": 9.901605098125526e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3949438.0,
|
|
"reward": 0.7085937857627869,
|
|
"reward_std": 0.49115630984306335,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 40,
|
|
"step_time": 6.119476022082381
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 233.0,
|
|
"completions/max_terminated_length": 233.0,
|
|
"completions/mean_length": 19.484375,
|
|
"completions/mean_terminated_length": 19.484375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.09150838013738394,
|
|
"epoch": 0.029285714285714286,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.892045704727863e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4034149.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 41,
|
|
"step_time": 15.748524485970847
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 164.0,
|
|
"completions/max_terminated_length": 164.0,
|
|
"completions/mean_length": 16.890625,
|
|
"completions/mean_terminated_length": 16.890625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.09939474705606699,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 7.690636157989502,
|
|
"learning_rate": 9.882048274282505e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4101726.0,
|
|
"reward": 0.39531251788139343,
|
|
"reward_std": 0.4616841673851013,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 42,
|
|
"step_time": 9.772893431887496
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 164.0,
|
|
"completions/mean_length": 61.640625,
|
|
"completions/mean_terminated_length": 30.111112594604492,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.11599440686404705,
|
|
"epoch": 0.030714285714285715,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 14.23160457611084,
|
|
"learning_rate": 9.871613701966066e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4184607.0,
|
|
"reward": 0.5335937738418579,
|
|
"reward_std": 0.5037994384765625,
|
|
"rewards/format_reward/mean": 0.9609375,
|
|
"rewards/format_reward/std": 0.18483558297157288,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 43,
|
|
"step_time": 144.26547849614872
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 399.0,
|
|
"completions/mean_length": 90.21875,
|
|
"completions/mean_terminated_length": 27.064516067504883,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.09353543492034078,
|
|
"epoch": 0.03142857142857143,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 19.40498924255371,
|
|
"learning_rate": 9.86074292209714e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4253973.0,
|
|
"reward": 0.581250011920929,
|
|
"reward_std": 0.5070533752441406,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.17536810040473938,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 44,
|
|
"step_time": 122.23275292402832
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 216.0,
|
|
"completions/max_terminated_length": 216.0,
|
|
"completions/mean_length": 17.015625,
|
|
"completions/mean_terminated_length": 17.015625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.07891270238906145,
|
|
"epoch": 0.03214285714285714,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 6.091732501983643,
|
|
"learning_rate": 9.849436908052636e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4362886.0,
|
|
"reward": 0.69140625,
|
|
"reward_std": 0.49646008014678955,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.59375,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 45,
|
|
"step_time": 16.195965035120025
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 349.0,
|
|
"completions/max_terminated_length": 349.0,
|
|
"completions/mean_length": 31.546875,
|
|
"completions/mean_terminated_length": 31.546875,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.1003469587303698,
|
|
"epoch": 0.032857142857142856,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 13.104848861694336,
|
|
"learning_rate": 9.837696672180618e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4433921.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.4732423424720764,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 46,
|
|
"step_time": 26.342010580934584
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 81.0,
|
|
"completions/max_terminated_length": 81.0,
|
|
"completions/mean_length": 21.015625,
|
|
"completions/mean_terminated_length": 21.015625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.09486840199679136,
|
|
"epoch": 0.03357142857142857,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 5.788740158081055,
|
|
"learning_rate": 9.825523265709665e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4514394.0,
|
|
"reward": 0.8460937738418579,
|
|
"reward_std": 0.44179511070251465,
|
|
"rewards/format_reward/mean": 0.9609375,
|
|
"rewards/format_reward/std": 0.18483558297157288,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 47,
|
|
"step_time": 6.866083464003168
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 188.0,
|
|
"completions/max_terminated_length": 188.0,
|
|
"completions/mean_length": 18.265625,
|
|
"completions/mean_terminated_length": 18.265625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.07957334164530039,
|
|
"epoch": 0.03428571428571429,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 11.120720863342285,
|
|
"learning_rate": 9.812917778654747e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4614995.0,
|
|
"reward": 0.7523437738418579,
|
|
"reward_std": 0.48439374566078186,
|
|
"rewards/format_reward/mean": 0.9609375,
|
|
"rewards/format_reward/std": 0.16194961965084076,
|
|
"rewards/mcq_exact_match_reward/mean": 0.65625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 48,
|
|
"step_time": 13.319389674987178
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 63.0,
|
|
"completions/max_terminated_length": 63.0,
|
|
"completions/mean_length": 13.765625,
|
|
"completions/mean_terminated_length": 13.765625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.05717065744102001,
|
|
"epoch": 0.035,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.799881339719614e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4694860.0,
|
|
"reward": 0.3500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 49,
|
|
"step_time": 6.507926742138807
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 293.0,
|
|
"completions/mean_length": 88.109375,
|
|
"completions/mean_terminated_length": 24.887096405029297,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.09659092174842954,
|
|
"epoch": 0.03571428571428571,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 14.933631896972656,
|
|
"learning_rate": 9.786415116195732e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4779483.0,
|
|
"reward": 0.706250011920929,
|
|
"reward_std": 0.49597588181495667,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.17536810040473938,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 50,
|
|
"step_time": 125.25686850992497
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 354.0,
|
|
"completions/max_terminated_length": 354.0,
|
|
"completions/mean_length": 30.5625,
|
|
"completions/mean_terminated_length": 30.5625,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.09759852662682533,
|
|
"epoch": 0.03642857142857143,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 15.208141326904297,
|
|
"learning_rate": 9.772520313857775e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4884719.0,
|
|
"reward": 0.3812500238418579,
|
|
"reward_std": 0.4531635046005249,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 51,
|
|
"step_time": 21.342308732913807
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 220.0,
|
|
"completions/mean_length": 51.109375,
|
|
"completions/mean_terminated_length": 19.41269874572754,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"entropy": 0.05151955410838127,
|
|
"epoch": 0.037142857142857144,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 10.268356323242188,
|
|
"learning_rate": 9.758198176855646e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4979958.0,
|
|
"reward": 0.6609375476837158,
|
|
"reward_std": 0.5019382238388062,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 52,
|
|
"step_time": 175.891883687058
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.06619986798614264,
|
|
"epoch": 0.03785714285714286,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 19.621379852294922,
|
|
"learning_rate": 9.74344998760308e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5049110.0,
|
|
"reward": 0.5523437857627869,
|
|
"reward_std": 0.5009062886238098,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 53,
|
|
"step_time": 4.150718963937834
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.03762774192728102,
|
|
"epoch": 0.03857142857142857,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 20.882497787475586,
|
|
"learning_rate": 9.72827706666282e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5112462.0,
|
|
"reward": 0.7093750238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 54,
|
|
"step_time": 3.5128560769953765
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.03650554618798196,
|
|
"epoch": 0.039285714285714285,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 15.54995059967041,
|
|
"learning_rate": 9.712680772628363e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5210902.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 55,
|
|
"step_time": 4.476348334981594
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.04692800110206008,
|
|
"epoch": 0.04,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.067854881286621,
|
|
"learning_rate": 9.696662502002318e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5293086.0,
|
|
"reward": 0.4125000238418579,
|
|
"reward_std": 0.4671765863895416,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 56,
|
|
"step_time": 4.003949869889766
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 25.0,
|
|
"completions/max_terminated_length": 25.0,
|
|
"completions/mean_length": 13.1875,
|
|
"completions/mean_terminated_length": 13.1875,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0425227633677423,
|
|
"epoch": 0.04071428571428572,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 10.757762908935547,
|
|
"learning_rate": 9.680223689071362e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5387882.0,
|
|
"reward": 0.48906251788139343,
|
|
"reward_std": 0.4931650757789612,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 57,
|
|
"step_time": 4.320090122986585
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 111.0,
|
|
"completions/max_terminated_length": 111.0,
|
|
"completions/mean_length": 14.53125,
|
|
"completions/mean_terminated_length": 14.53125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.024435568135231733,
|
|
"epoch": 0.041428571428571426,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.87435245513916,
|
|
"learning_rate": 9.663365805777814e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5488564.0,
|
|
"reward": 0.4906250238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 58,
|
|
"step_time": 9.629043828055728
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.010381318046711385,
|
|
"epoch": 0.04214285714285714,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.77026081085205,
|
|
"learning_rate": 9.646090361587827e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5551852.0,
|
|
"reward": 1.021875023841858,
|
|
"reward_std": 0.2704896926879883,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.921875,
|
|
"rewards/mcq_exact_match_reward/std": 0.27048972249031067,
|
|
"step": 59,
|
|
"step_time": 3.6506365661043674
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 24.0,
|
|
"completions/max_terminated_length": 24.0,
|
|
"completions/mean_length": 13.859375,
|
|
"completions/mean_terminated_length": 13.859375,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.024835634976625443,
|
|
"epoch": 0.04285714285714286,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 13.699368476867676,
|
|
"learning_rate": 9.628398903356239e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5633435.0,
|
|
"reward": 0.3109375238418579,
|
|
"reward_std": 0.42168113589286804,
|
|
"rewards/format_reward/mean": 0.921875,
|
|
"rewards/format_reward/std": 0.27048972249031067,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 60,
|
|
"step_time": 5.84596428705845
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.011352454603184015,
|
|
"epoch": 0.04357142857142857,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 24.677173614501953,
|
|
"learning_rate": 9.610293015188067e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5714739.0,
|
|
"reward": 0.8031250238418579,
|
|
"reward_std": 0.46049273014068604,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 61,
|
|
"step_time": 5.298729537054896
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01830141741083935,
|
|
"epoch": 0.04428571428571428,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.59177431829666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5774291.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 62,
|
|
"step_time": 3.0780068120220676
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.014526999875670299,
|
|
"epoch": 0.045,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 16.54155731201172,
|
|
"learning_rate": 9.572844470858537e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5850667.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.4836103618144989,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 63,
|
|
"step_time": 4.529202252917457
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.02446594147477299,
|
|
"epoch": 0.045714285714285714,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 27.907894134521484,
|
|
"learning_rate": 9.55350516786491e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5921579.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 64,
|
|
"step_time": 3.6849751479458064
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.027877062326297164,
|
|
"epoch": 0.04642857142857143,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 29.490297317504883,
|
|
"learning_rate": 9.533758140969912e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6020115.0,
|
|
"reward": 0.4906250238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 65,
|
|
"step_time": 6.300093016005121
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.016057265631388873,
|
|
"epoch": 0.047142857142857146,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 18.212966918945312,
|
|
"learning_rate": 9.513605158335562e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6123723.0,
|
|
"reward": 0.6468750238418579,
|
|
"reward_std": 0.501733124256134,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 66,
|
|
"step_time": 4.889689573086798
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.008832770312437788,
|
|
"epoch": 0.047857142857142855,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.493048024473411e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6190811.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 67,
|
|
"step_time": 3.4412925080396235
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01757131062913686,
|
|
"epoch": 0.04857142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 16.572786331176758,
|
|
"learning_rate": 9.47208858008299e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6293939.0,
|
|
"reward": 0.5531250238418579,
|
|
"reward_std": 0.501733124256134,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 68,
|
|
"step_time": 5.069954339065589
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.010598326101899147,
|
|
"epoch": 0.04928571428571429,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.450728701886983e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6360843.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.3333333432674408,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 69,
|
|
"step_time": 3.045385909965262
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.03748940723016858,
|
|
"epoch": 0.05,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 23.385046005249023,
|
|
"learning_rate": 9.428970302463184e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6435987.0,
|
|
"reward": 0.5375000238418579,
|
|
"reward_std": 0.5,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 70,
|
|
"step_time": 4.197044083906803
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.011510996264405549,
|
|
"epoch": 0.05071428571428571,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 50.076045989990234,
|
|
"learning_rate": 9.406815330073244e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6503731.0,
|
|
"reward": 0.3812500238418579,
|
|
"reward_std": 0.4531635046005249,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 71,
|
|
"step_time": 4.2282736750203185
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.031134499120526016,
|
|
"epoch": 0.05142857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.384265768488224e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6611515.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 72,
|
|
"step_time": 5.246204287977889
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.026873694732785225,
|
|
"epoch": 0.052142857142857144,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 22.986095428466797,
|
|
"learning_rate": 9.36132363681097e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6719259.0,
|
|
"reward": 0.5843750238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 73,
|
|
"step_time": 5.210786890995223
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.00942779217439238,
|
|
"epoch": 0.05285714285714286,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.122662544250488,
|
|
"learning_rate": 9.337990989295304e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6794507.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.4732423424720764,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 74,
|
|
"step_time": 3.724674280034378
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01156144640117418,
|
|
"epoch": 0.05357142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 6.401269912719727,
|
|
"learning_rate": 9.314269915162114e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6863011.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.4836103618144989,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 75,
|
|
"step_time": 3.4731274269870482
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01271917184931226,
|
|
"epoch": 0.054285714285714284,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 14.784469604492188,
|
|
"learning_rate": 9.290162538412255e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6941691.0,
|
|
"reward": 0.6781250238418579,
|
|
"reward_std": 0.49776285886764526,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 76,
|
|
"step_time": 3.031989069073461
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 13.015625,
|
|
"completions/mean_terminated_length": 13.015625,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.010391183634055778,
|
|
"epoch": 0.055,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 4.5144572257995605,
|
|
"learning_rate": 9.265671017636382e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7022972.0,
|
|
"reward": 1.084375023841858,
|
|
"reward_std": 0.125,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.984375,
|
|
"rewards/mcq_exact_match_reward/std": 0.125,
|
|
"step": 77,
|
|
"step_time": 3.7917707841843367
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0105912602157332,
|
|
"epoch": 0.055714285714285716,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 20.127111434936523,
|
|
"learning_rate": 9.240797545821666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7095780.0,
|
|
"reward": 0.6312500238418579,
|
|
"reward_std": 0.502967357635498,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 78,
|
|
"step_time": 3.8805220928625204
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.03518920624628663,
|
|
"epoch": 0.056428571428571425,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 13.436991691589355,
|
|
"learning_rate": 9.215544350155422e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7178476.0,
|
|
"reward": 0.3343750238418579,
|
|
"reward_std": 0.42695629596710205,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 79,
|
|
"step_time": 3.431473600969184
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.016191222646739334,
|
|
"epoch": 0.05714285714285714,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 5.861770153045654,
|
|
"learning_rate": 9.189913691825699e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7261244.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 80,
|
|
"step_time": 3.8818058649194427
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.029405928449705243,
|
|
"epoch": 0.05785714285714286,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 9.366527557373047,
|
|
"learning_rate": 9.163907865818806e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7359540.0,
|
|
"reward": 0.8187500238418579,
|
|
"reward_std": 0.4531635046005249,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.71875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 81,
|
|
"step_time": 5.645089631958399
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01334192231297493,
|
|
"epoch": 0.05857142857142857,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.137529200713809e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7448924.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 82,
|
|
"step_time": 5.150634653924499
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.02549466968048364,
|
|
"epoch": 0.05928571428571429,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 19.77663803100586,
|
|
"learning_rate": 9.11078005847405e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7518348.0,
|
|
"reward": 0.5062500238418579,
|
|
"reward_std": 0.49501484632492065,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 83,
|
|
"step_time": 3.094080194074195
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.003772719392145518,
|
|
"epoch": 0.06,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.083662834235629e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7595364.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 84,
|
|
"step_time": 3.666810050024651
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.004507055869908072,
|
|
"epoch": 0.060714285714285714,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.056179956092961e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7691668.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 85,
|
|
"step_time": 4.592653869010974
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.006571650264959317,
|
|
"epoch": 0.06142857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.028333884881356e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7802836.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 86,
|
|
"step_time": 5.15318116301205
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.007576553849503398,
|
|
"epoch": 0.062142857142857146,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 6.100246906280518,
|
|
"learning_rate": 9.000127113956672e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7873540.0,
|
|
"reward": 0.7093750238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 87,
|
|
"step_time": 3.563799056049902
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.03412088053300977,
|
|
"epoch": 0.06285714285714286,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 13.682866096496582,
|
|
"learning_rate": 8.971562168972064e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7968516.0,
|
|
"reward": 0.5375000238418579,
|
|
"reward_std": 0.5,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 88,
|
|
"step_time": 4.252024297020398
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.011378830298781395,
|
|
"epoch": 0.06357142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 40.427642822265625,
|
|
"learning_rate": 8.942641607651828e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8050940.0,
|
|
"reward": 0.4906250238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 89,
|
|
"step_time": 3.7895749480230734
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.004507646634010598,
|
|
"epoch": 0.06428571428571428,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.91336801956239e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8118580.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 90,
|
|
"step_time": 4.1695034088916145
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0008988323461380787,
|
|
"epoch": 0.065,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 6.886898040771484,
|
|
"learning_rate": 8.883744025880427e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8209372.0,
|
|
"reward": 0.8343750238418579,
|
|
"reward_std": 0.44515693187713623,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.734375,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 91,
|
|
"step_time": 4.953228909056634
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.010154272997169755,
|
|
"epoch": 0.06571428571428571,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.853772279158165e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8290516.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.3333333432674408,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 92,
|
|
"step_time": 4.79224861896364
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.028330315952189267,
|
|
"epoch": 0.06642857142857143,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 13.889434814453125,
|
|
"learning_rate": 8.823455463085873e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8369148.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 93,
|
|
"step_time": 5.352049615990836
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.02251565270125866,
|
|
"epoch": 0.06714285714285714,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 15.911195755004883,
|
|
"learning_rate": 8.792796292251559e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8447308.0,
|
|
"reward": 0.6468750238418579,
|
|
"reward_std": 0.501733124256134,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 94,
|
|
"step_time": 3.4189756700070575
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.010006657859776169,
|
|
"epoch": 0.06785714285714285,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.761797511897906e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8552556.0,
|
|
"reward": 0.3500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 95,
|
|
"step_time": 5.176017292018514
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.019635747539723525,
|
|
"epoch": 0.06857142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 4.355508327484131,
|
|
"learning_rate": 8.730461897676463e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8632988.0,
|
|
"reward": 0.9593750238418579,
|
|
"reward_std": 0.3503824472427368,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.859375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 96,
|
|
"step_time": 3.6474721890990622
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.016831232234835625,
|
|
"epoch": 0.06928571428571428,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.698792255399103e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8693324.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 97,
|
|
"step_time": 2.8727327780216
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.010678260587155819,
|
|
"epoch": 0.07,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 14.87441635131836,
|
|
"learning_rate": 8.666791420786803e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8784164.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 98,
|
|
"step_time": 5.402249445091002
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.007853476068703458,
|
|
"epoch": 0.07071428571428572,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 4.313704490661621,
|
|
"learning_rate": 8.634462259215718e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8888620.0,
|
|
"reward": 0.4906250238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 99,
|
|
"step_time": 5.071004737867042
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.023348471499048173,
|
|
"epoch": 0.07142857142857142,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.054193496704102,
|
|
"learning_rate": 8.601807665460619e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8994956.0,
|
|
"reward": 0.5531250238418579,
|
|
"reward_std": 0.501733124256134,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 100,
|
|
"step_time": 4.593129857035819
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.013781818954157643,
|
|
"epoch": 0.07214285714285715,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 30.938512802124023,
|
|
"learning_rate": 8.568830563435694e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 9060868.0,
|
|
"reward": 0.6312500238418579,
|
|
"reward_std": 0.502967357635498,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 101,
|
|
"step_time": 3.508080493018497
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0011032559996237978,
|
|
"epoch": 0.07285714285714286,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.535533905932737e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9130724.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 102,
|
|
"step_time": 4.060601889912505
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.006481183590949513,
|
|
"epoch": 0.07357142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 29.389034271240234,
|
|
"learning_rate": 8.501920674356754e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9225292.0,
|
|
"reward": 0.8031250238418579,
|
|
"reward_std": 0.46049273014068604,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 103,
|
|
"step_time": 4.907652391004376
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.008887886391676147,
|
|
"epoch": 0.07428571428571429,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 5.415668964385986,
|
|
"learning_rate": 8.467993878459003e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9321148.0,
|
|
"reward": 1.084375023841858,
|
|
"reward_std": 0.125,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.984375,
|
|
"rewards/mcq_exact_match_reward/std": 0.125,
|
|
"step": 104,
|
|
"step_time": 6.214851375028957
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.006077070225728676,
|
|
"epoch": 0.075,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 16.421894073486328,
|
|
"learning_rate": 8.433756556067505e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 9410788.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 105,
|
|
"step_time": 3.524499777937308
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0009316421674157027,
|
|
"epoch": 0.07571428571428572,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 7.468184947967529,
|
|
"learning_rate": 8.399211772815029e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9482460.0,
|
|
"reward": 0.8656250238418579,
|
|
"reward_std": 0.42695629596710205,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.765625,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 106,
|
|
"step_time": 3.985870380070992
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.02126017672708258,
|
|
"epoch": 0.07642857142857143,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 17.35074234008789,
|
|
"learning_rate": 8.364362621864594e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9575012.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.4732423424720764,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 107,
|
|
"step_time": 4.1317150149843656
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0016822229663375765,
|
|
"epoch": 0.07714285714285714,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.32921222363251e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9652836.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 108,
|
|
"step_time": 3.163825726893265
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.00396055904275272,
|
|
"epoch": 0.07785714285714286,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.293763725508969e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9710348.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 109,
|
|
"step_time": 2.513939847005531
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.010725474741775542,
|
|
"epoch": 0.07857142857142857,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.258020301576223e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9789500.0,
|
|
"reward": 0.22500000894069672,
|
|
"reward_std": 0.3333333432674408,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 110,
|
|
"step_time": 3.739621463988442
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01487346699286718,
|
|
"epoch": 0.07928571428571428,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.221985152324384e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9864732.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 111,
|
|
"step_time": 3.2874150619027205
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 136.0,
|
|
"completions/max_terminated_length": 136.0,
|
|
"completions/mean_length": 14.921875,
|
|
"completions/mean_terminated_length": 14.921875,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.011617353127803653,
|
|
"epoch": 0.08,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 15.174339294433594,
|
|
"learning_rate": 8.185661504364844e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9949247.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 112,
|
|
"step_time": 12.715428045834415
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.014321622533316258,
|
|
"epoch": 0.08071428571428571,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.149052610141355e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10021695.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 113,
|
|
"step_time": 3.706441980903037
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.012922606896609068,
|
|
"epoch": 0.08142857142857143,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 19.292720794677734,
|
|
"learning_rate": 8.112161747638821e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10120495.0,
|
|
"reward": 0.7562500238418579,
|
|
"reward_std": 0.4787135720252991,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.65625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 114,
|
|
"step_time": 6.525050577998627
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.011818792903795838,
|
|
"epoch": 0.08214285714285714,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.074992220089768e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10205991.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 115,
|
|
"step_time": 4.035168804053683
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01754431438166648,
|
|
"epoch": 0.08285714285714285,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 29.407684326171875,
|
|
"learning_rate": 8.037547355678576e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10278287.0,
|
|
"reward": 0.7875000238418579,
|
|
"reward_std": 0.4671765863895416,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.6875,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 116,
|
|
"step_time": 3.86815082404064
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.020069130638148636,
|
|
"epoch": 0.08357142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.89538860321045,
|
|
"learning_rate": 7.999830507243477e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 10350751.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.4836103618144989,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 117,
|
|
"step_time": 3.734183361113537
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.013207862619310617,
|
|
"epoch": 0.08428571428571428,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 17.539016723632812,
|
|
"learning_rate": 7.961845051976332e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10437807.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.4732423424720764,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 118,
|
|
"step_time": 3.960929322929587
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.008311279772897251,
|
|
"epoch": 0.085,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 23.61504554748535,
|
|
"learning_rate": 7.923594391120236e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10513295.0,
|
|
"reward": 0.9593750238418579,
|
|
"reward_std": 0.3503824472427368,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.859375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 119,
|
|
"step_time": 3.613634814915713
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01768041953619104,
|
|
"epoch": 0.08571428571428572,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 7.440960884094238,
|
|
"learning_rate": 7.88508194966497e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 10605735.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 120,
|
|
"step_time": 4.8889303539763205
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.005560076671827119,
|
|
"epoch": 0.08642857142857142,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 10.563973426818848,
|
|
"learning_rate": 7.84631117604033e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 10704791.0,
|
|
"reward": 0.8656250238418579,
|
|
"reward_std": 0.42695629596710205,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.765625,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 121,
|
|
"step_time": 5.653950936044566
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.008863497932907194,
|
|
"epoch": 0.08714285714285715,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.157200813293457,
|
|
"learning_rate": 7.80728554180734e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 10782399.0,
|
|
"reward": 0.5687500238418579,
|
|
"reward_std": 0.502967357635498,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 122,
|
|
"step_time": 4.953501323063392
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.022662203875370324,
|
|
"epoch": 0.08785714285714286,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 10.805572509765625,
|
|
"learning_rate": 7.768008541347421e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 10867071.0,
|
|
"reward": 0.8656250238418579,
|
|
"reward_std": 0.42695629596710205,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.765625,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 123,
|
|
"step_time": 5.814674368069973
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.015233667800202966,
|
|
"epoch": 0.08857142857142856,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.728483691549491e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 10967471.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 124,
|
|
"step_time": 4.661392649053596
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 13.03125,
|
|
"completions/mean_terminated_length": 13.03125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0066926556173712015,
|
|
"epoch": 0.08928571428571429,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.41643238067627,
|
|
"learning_rate": 7.688714531495059e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11039809.0,
|
|
"reward": 0.971875011920929,
|
|
"reward_std": 0.34201493859291077,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.17536810040473938,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 125,
|
|
"step_time": 3.1613576939562336
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0020889575462206267,
|
|
"epoch": 0.09,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.648704622141347e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11132489.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 126,
|
|
"step_time": 4.56386918888893
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.013643089099787176,
|
|
"epoch": 0.09071428571428572,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 23.069637298583984,
|
|
"learning_rate": 7.608457546002422e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11226489.0,
|
|
"reward": 0.5375000238418579,
|
|
"reward_std": 0.5,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 127,
|
|
"step_time": 6.954521012026817
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.007244768785312772,
|
|
"epoch": 0.09142857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.56797690682843e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11320225.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 128,
|
|
"step_time": 5.042212332948111
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.011887789936736226,
|
|
"epoch": 0.09214285714285714,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 4.810478687286377,
|
|
"learning_rate": 7.527266329282905e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 11412033.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.4836103618144989,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 129,
|
|
"step_time": 4.549038991041016
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0014612795275752433,
|
|
"epoch": 0.09285714285714286,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.486329458618215e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11491449.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.3333333432674408,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 130,
|
|
"step_time": 3.1755020670825616
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.021196995279751718,
|
|
"epoch": 0.09357142857142857,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 28.714065551757812,
|
|
"learning_rate": 7.445169960349166e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11589273.0,
|
|
"reward": 0.6937500238418579,
|
|
"reward_std": 0.49501484632492065,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.59375,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 131,
|
|
"step_time": 4.060201975051314
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.026706171571277082,
|
|
"epoch": 0.09428571428571429,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 7.088040828704834,
|
|
"learning_rate": 7.403791519924793e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11670569.0,
|
|
"reward": 0.9125000238418579,
|
|
"reward_std": 0.39339789748191833,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.8125,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 132,
|
|
"step_time": 3.3845946129295044
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.007905131948064081,
|
|
"epoch": 0.095,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 48.98587417602539,
|
|
"learning_rate": 7.362197842398354e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 11745481.0,
|
|
"reward": 1.053125023841858,
|
|
"reward_std": 0.2130420207977295,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.953125,
|
|
"rewards/mcq_exact_match_reward/std": 0.21304203569889069,
|
|
"step": 133,
|
|
"step_time": 3.9371052511269227
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.003999426597147249,
|
|
"epoch": 0.09571428571428571,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.320392652095583e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11819481.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.3333333432674408,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 134,
|
|
"step_time": 3.4779590580728836
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.005585818376857787,
|
|
"epoch": 0.09642857142857143,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 18.344497680664062,
|
|
"learning_rate": 7.278379692281208e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 11900009.0,
|
|
"reward": 0.8343750238418579,
|
|
"reward_std": 0.44515693187713623,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.734375,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 135,
|
|
"step_time": 3.6108506870805286
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0216169988270849,
|
|
"epoch": 0.09714285714285714,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 10.213824272155762,
|
|
"learning_rate": 7.236162724823778e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 11999353.0,
|
|
"reward": 0.6312500238418579,
|
|
"reward_std": 0.502967357635498,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 136,
|
|
"step_time": 4.899725366034545
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01557689777109772,
|
|
"epoch": 0.09785714285714285,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 9.55334758758545,
|
|
"learning_rate": 7.193745529858826e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12073633.0,
|
|
"reward": 0.8656250238418579,
|
|
"reward_std": 0.42695629596710205,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.765625,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 137,
|
|
"step_time": 3.584852166008204
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.000477323465020163,
|
|
"epoch": 0.09857142857142857,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.151131905450385e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12156857.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 138,
|
|
"step_time": 4.653962876996957
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 13.078125,
|
|
"completions/mean_terminated_length": 13.078125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.02223566616885364,
|
|
"epoch": 0.09928571428571428,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 9.778901100158691,
|
|
"learning_rate": 7.10832566725092e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 12249710.0,
|
|
"reward": 0.5843750238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 139,
|
|
"step_time": 5.2385836789617315
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.015097202500328422,
|
|
"epoch": 0.1,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 39.4503059387207,
|
|
"learning_rate": 7.065330648159655e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12319550.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 140,
|
|
"step_time": 3.229677329014521
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.026214689016342163,
|
|
"epoch": 0.10071428571428571,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 11.438697814941406,
|
|
"learning_rate": 7.022150697979384e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12401566.0,
|
|
"reward": 0.8343750238418579,
|
|
"reward_std": 0.44515693187713623,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.734375,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 141,
|
|
"step_time": 3.4784729500534013
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 301.0,
|
|
"completions/max_terminated_length": 301.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.015020072867628187,
|
|
"epoch": 0.10142857142857142,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 14.154157638549805,
|
|
"learning_rate": 6.978789683071759e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12481998.0,
|
|
"reward": 0.7562500238418579,
|
|
"reward_std": 0.4787135720252991,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.65625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 142,
|
|
"step_time": 16.412130515964236
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.003459588740952313,
|
|
"epoch": 0.10214285714285715,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.935251486011086e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12568078.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 143,
|
|
"step_time": 4.359226984961424
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.028008434863295406,
|
|
"epoch": 0.10285714285714286,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 23.836597442626953,
|
|
"learning_rate": 6.891540005236674e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12681062.0,
|
|
"reward": 0.5843750238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 144,
|
|
"step_time": 6.600195047038142
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 56.0,
|
|
"completions/max_terminated_length": 56.0,
|
|
"completions/mean_length": 13.671875,
|
|
"completions/mean_terminated_length": 13.671875,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.006988945358898491,
|
|
"epoch": 0.10357142857142858,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 10.568206787109375,
|
|
"learning_rate": 6.847659154703785e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12782745.0,
|
|
"reward": 0.9593750238418579,
|
|
"reward_std": 0.3503824472427368,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.859375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 145,
|
|
"step_time": 6.1226727640023455
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.016712593787815422,
|
|
"epoch": 0.10428571428571429,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.803612863533149e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12884537.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 146,
|
|
"step_time": 4.518327344034333
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.008153474889695644,
|
|
"epoch": 0.105,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.759405075659165e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12972753.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.3333333432674408,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 147,
|
|
"step_time": 4.804796123993583
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.00037117511965334415,
|
|
"epoch": 0.10571428571428572,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.715039749476763e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13049361.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 148,
|
|
"step_time": 6.387204763945192
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.004881141547230072,
|
|
"epoch": 0.10642857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.670520857486949e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13125169.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 149,
|
|
"step_time": 3.4786089719855227
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0024245021922979504,
|
|
"epoch": 0.10714285714285714,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.625852385941118e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13213561.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 150,
|
|
"step_time": 4.445251900062431
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 52.0,
|
|
"completions/max_terminated_length": 52.0,
|
|
"completions/mean_length": 13.609375,
|
|
"completions/mean_terminated_length": 13.609375,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.012313872866798192,
|
|
"epoch": 0.10785714285714286,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.58103833448412e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13301696.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 151,
|
|
"step_time": 6.667017061030492
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.032932347152382135,
|
|
"epoch": 0.10857142857142857,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 8.404014587402344,
|
|
"learning_rate": 6.536082715796124e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13406568.0,
|
|
"reward": 0.5375000238418579,
|
|
"reward_std": 0.5,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 152,
|
|
"step_time": 7.167247849982232
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.007070704363286495,
|
|
"epoch": 0.10928571428571429,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 9.130770683288574,
|
|
"learning_rate": 6.490989555233327e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13484888.0,
|
|
"reward": 0.7875000238418579,
|
|
"reward_std": 0.4671765863895416,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.6875,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 153,
|
|
"step_time": 5.255881706951186
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.017676749383099377,
|
|
"epoch": 0.11,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 18.37973976135254,
|
|
"learning_rate": 6.445762890467517e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13561544.0,
|
|
"reward": 0.6937500238418579,
|
|
"reward_std": 0.49501484632492065,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.59375,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 154,
|
|
"step_time": 4.068024636886548
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.008656841120682657,
|
|
"epoch": 0.11071428571428571,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.400406771124535e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13638768.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 155,
|
|
"step_time": 4.653675792971626
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.00955948673072271,
|
|
"epoch": 0.11142857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.354925258421675e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13731864.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 156,
|
|
"step_time": 4.129786372825038
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0076383089399314485,
|
|
"epoch": 0.11214285714285714,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 18.645206451416016,
|
|
"learning_rate": 6.309322424804033e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13804760.0,
|
|
"reward": 0.9593750238418579,
|
|
"reward_std": 0.3503824472427368,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.859375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 157,
|
|
"step_time": 4.5105831088731065
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 98.0,
|
|
"completions/max_terminated_length": 98.0,
|
|
"completions/mean_length": 15.203125,
|
|
"completions/mean_terminated_length": 15.203125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.024698378081666306,
|
|
"epoch": 0.11285714285714285,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 16.9991397857666,
|
|
"learning_rate": 6.263602353579866e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13892093.0,
|
|
"reward": 0.6781250238418579,
|
|
"reward_std": 0.49776285886764526,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 158,
|
|
"step_time": 8.032394355046563
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.013417807058431208,
|
|
"epoch": 0.11357142857142857,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.217769138554959e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 13991829.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 159,
|
|
"step_time": 6.180849090975244
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0066935273935087025,
|
|
"epoch": 0.11428571428571428,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 14.025964736938477,
|
|
"learning_rate": 6.171826883666074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14080677.0,
|
|
"reward": 0.7093750238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 160,
|
|
"step_time": 5.380768697999883
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.006595649909286294,
|
|
"epoch": 0.115,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 28.2905216217041,
|
|
"learning_rate": 6.12577970261347e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14169429.0,
|
|
"reward": 0.5062500238418579,
|
|
"reward_std": 0.49501484632492065,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 161,
|
|
"step_time": 6.251126443035901
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.019410144712310284,
|
|
"epoch": 0.11571428571428571,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 30.791730880737305,
|
|
"learning_rate": 6.079631718492568e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14252709.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.4732423424720764,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 162,
|
|
"step_time": 4.848814221972134
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.00473528123984579,
|
|
"epoch": 0.11642857142857142,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 9.505372047424316,
|
|
"learning_rate": 6.033387063424764e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 14328125.0,
|
|
"reward": 0.7093750238418579,
|
|
"reward_std": 0.4917473793029785,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 163,
|
|
"step_time": 6.175865585100837
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0055000272041070275,
|
|
"epoch": 0.11714285714285715,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.987049878187436e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14403613.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 164,
|
|
"step_time": 3.767183911986649
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0019281105633126572,
|
|
"epoch": 0.11785714285714285,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.940624311843168e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14489277.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 165,
|
|
"step_time": 3.9413183170254342
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.002359322093980154,
|
|
"epoch": 0.11857142857142858,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.894114521368258e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14552301.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.3333333432674408,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 166,
|
|
"step_time": 3.563035011989996
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.003409396253118757,
|
|
"epoch": 0.11928571428571429,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.847524671280483e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14630309.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 167,
|
|
"step_time": 3.99354725406738
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.006735146220307797,
|
|
"epoch": 0.12,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 7.990384101867676,
|
|
"learning_rate": 5.800858933266212e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14725229.0,
|
|
"reward": 0.9593750238418579,
|
|
"reward_std": 0.3503824472427368,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.859375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 168,
|
|
"step_time": 8.22251579206204
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.019712560577318072,
|
|
"epoch": 0.12071428571428572,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 56.4944953918457,
|
|
"learning_rate": 5.75412148580687e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14813061.0,
|
|
"reward": 0.3187500238418579,
|
|
"reward_std": 0.4166666567325592,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 169,
|
|
"step_time": 4.142233877966646
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.010842469346243888,
|
|
"epoch": 0.12142857142857143,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 20.774930953979492,
|
|
"learning_rate": 5.707316513804792e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14889661.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 170,
|
|
"step_time": 4.211614910978824
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0063042285037226975,
|
|
"epoch": 0.12214285714285714,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 15.547407150268555,
|
|
"learning_rate": 5.660448208208513e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 14949869.0,
|
|
"reward": 0.6312500238418579,
|
|
"reward_std": 0.502967357635498,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 171,
|
|
"step_time": 3.5212799030705355
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.006180563321322552,
|
|
"epoch": 0.12285714285714286,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.613520765637489e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15014413.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 172,
|
|
"step_time": 3.4571051630191505
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 13.015625,
|
|
"completions/mean_terminated_length": 13.015625,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.017323336040135473,
|
|
"epoch": 0.12357142857142857,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 31.213979721069336,
|
|
"learning_rate": 5.56653838800635e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15096078.0,
|
|
"reward": 0.7703125476837158,
|
|
"reward_std": 0.47565484046936035,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 173,
|
|
"step_time": 5.5480771050206386
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.009160117624560371,
|
|
"epoch": 0.12428571428571429,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 13.76261043548584,
|
|
"learning_rate": 5.519505282148643e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15184310.0,
|
|
"reward": 0.9125000238418579,
|
|
"reward_std": 0.39339789748191833,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.8125,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 174,
|
|
"step_time": 4.350894231873099
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0008767848557909019,
|
|
"epoch": 0.125,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.472425659440156e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15251222.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 175,
|
|
"step_time": 3.4637152470531873
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.029627398937009275,
|
|
"epoch": 0.12571428571428572,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 27.523025512695312,
|
|
"learning_rate": 5.425303735421828e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15340198.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 176,
|
|
"step_time": 4.828774099005386
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.015928649459965527,
|
|
"epoch": 0.12642857142857142,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 25.298648834228516,
|
|
"learning_rate": 5.378143729422284e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15439142.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 177,
|
|
"step_time": 4.618877793021966
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0004920928804494906,
|
|
"epoch": 0.12714285714285714,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.330949864180033e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15518342.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 178,
|
|
"step_time": 4.00981811794918
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.014561123214662075,
|
|
"epoch": 0.12785714285714286,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 33.129024505615234,
|
|
"learning_rate": 5.28372636546537e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15594046.0,
|
|
"reward": 0.6937500238418579,
|
|
"reward_std": 0.49501484632492065,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.59375,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 179,
|
|
"step_time": 4.728988194023259
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0006019276152073871,
|
|
"epoch": 0.12857142857142856,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.236477461701985e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15677166.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 180,
|
|
"step_time": 4.448384342947975
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0006171667537273606,
|
|
"epoch": 0.12928571428571428,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.189207383588352e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15754846.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.4364357888698578,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 181,
|
|
"step_time": 3.5371177589986473
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.012223656303831376,
|
|
"epoch": 0.13,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.141920363718916e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15830046.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5039526224136353,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 182,
|
|
"step_time": 4.245069849013817
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0062240139523055404,
|
|
"epoch": 0.13071428571428573,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 31.65149688720703,
|
|
"learning_rate": 5.094620636205095e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15926630.0,
|
|
"reward": 0.9593750238418579,
|
|
"reward_std": 0.3503824472427368,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.859375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 183,
|
|
"step_time": 5.813327364914585
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.008034320664592087,
|
|
"epoch": 0.13142857142857142,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 30.067668914794922,
|
|
"learning_rate": 5.047312436296158e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15987494.0,
|
|
"reward": 0.6468750238418579,
|
|
"reward_std": 0.501733124256134,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 184,
|
|
"step_time": 2.739044851041399
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01749404292786494,
|
|
"epoch": 0.13214285714285715,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16058550.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 185,
|
|
"step_time": 2.8902318400796503
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.01119388552615419,
|
|
"epoch": 0.13285714285714287,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 10.966423034667969,
|
|
"learning_rate": 4.952687563703841e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 16143262.0,
|
|
"reward": 0.9437500238418579,
|
|
"reward_std": 0.36596253514289856,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.84375,
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
|
"step": 186,
|
|
"step_time": 4.150697597942781
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.006861376463348279,
|
|
"epoch": 0.13357142857142856,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.331680297851562,
|
|
"learning_rate": 4.905379363794906e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16214558.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.4732423424720764,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 187,
|
|
"step_time": 2.8928872388787568
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.007184316560596926,
|
|
"epoch": 0.13428571428571429,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 55.67695999145508,
|
|
"learning_rate": 4.858079636281084e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16307494.0,
|
|
"reward": 1.037500023841858,
|
|
"reward_std": 0.24397501349449158,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.9375,
|
|
"rewards/mcq_exact_match_reward/std": 0.24397502839565277,
|
|
"step": 188,
|
|
"step_time": 4.562975488952361
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.004760141251608729,
|
|
"epoch": 0.135,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 24.260726928710938,
|
|
"learning_rate": 4.810792616411649e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16383118.0,
|
|
"reward": 0.8343750238418579,
|
|
"reward_std": 0.44515693187713623,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.734375,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 189,
|
|
"step_time": 3.853932528058067
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.011452905309852213,
|
|
"epoch": 0.1357142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.867086410522461,
|
|
"learning_rate": 4.763522538298017e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16474206.0,
|
|
"reward": 0.5843750238418579,
|
|
"reward_std": 0.5037065148353577,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 190,
|
|
"step_time": 3.691243421053514
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.004947894281940535,
|
|
"epoch": 0.13642857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 4.7162736345346296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16559710.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 191,
|
|
"step_time": 4.455151985981502
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.011931511922739446,
|
|
"epoch": 0.13714285714285715,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 10.304251670837402,
|
|
"learning_rate": 4.6690501358199655e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16667158.0,
|
|
"reward": 0.6625000238418579,
|
|
"reward_std": 0.5,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 192,
|
|
"step_time": 4.7453289949917234
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 600.0,
|
|
"completions/max_terminated_length": 600.0,
|
|
"completions/mean_length": 22.171875,
|
|
"completions/mean_terminated_length": 22.171875,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0034573601587908342,
|
|
"epoch": 0.13785714285714284,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 0.35540762543678284,
|
|
"learning_rate": 4.621856270577718e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 16747961.0,
|
|
"reward": 0.7242187857627869,
|
|
"reward_std": 0.4890054762363434,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 193,
|
|
"step_time": 36.201234069943894
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.008329036620125407,
|
|
"epoch": 0.13857142857142857,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 4.5746962645781723e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16815801.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 194,
|
|
"step_time": 3.6494918860844336
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.003467444634225103,
|
|
"epoch": 0.1392857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 4.5275743405598437e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16894113.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.3333333432674408,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 195,
|
|
"step_time": 3.5404545970377512
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.007901970122475177,
|
|
"epoch": 0.14,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.832724571228027,
|
|
"learning_rate": 4.480494717851358e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 16975785.0,
|
|
"reward": 0.5718749761581421,
|
|
"reward_std": 0.5168491005897522,
|
|
"rewards/format_reward/mean": 0.875,
|
|
"rewards/format_reward/std": 0.3333333432674408,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 196,
|
|
"step_time": 4.639674770005513
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.0012349539283604827,
|
|
"epoch": 0.1407142857142857,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 4.433461611993651e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 17060081.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 197,
|
|
"step_time": 5.708853227144573
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.013591020833700895,
|
|
"epoch": 0.14142857142857143,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.822610855102539,
|
|
"learning_rate": 4.3864792343625115e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 17136505.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.4836103618144989,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 198,
|
|
"step_time": 3.695346627966501
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.00963415315709426,
|
|
"epoch": 0.14214285714285715,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 22.993799209594727,
|
|
"learning_rate": 4.3395517917914894e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 17220825.0,
|
|
"reward": 0.8187500238418579,
|
|
"reward_std": 0.4531635046005249,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.71875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 199,
|
|
"step_time": 4.361560025077779
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.013790814788080752,
|
|
"epoch": 0.14285714285714285,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 63.175533294677734,
|
|
"learning_rate": 4.2926834861952077e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 17321257.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.4836103618144989,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 200,
|
|
"step_time": 5.2724235990899615
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 350,
|
|
"num_input_tokens_seen": 17321257,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|