Model: cjiao/golden-goose-qwen2.5-1.5b-instruct-stratified-groups Source: Original Platform
1485 lines
54 KiB
JSON
1485 lines
54 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.03571428571428571,
|
|
"eval_steps": 500,
|
|
"global_step": 50,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1696.0,
|
|
"completions/mean_length": 508.0,
|
|
"completions/mean_terminated_length": 458.32257080078125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.14663860481232405,
|
|
"epoch": 0.0007142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.91739273071289,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.0,
|
|
"num_tokens": 148816.0,
|
|
"reward": 0.27421873807907104,
|
|
"reward_std": 0.4313132166862488,
|
|
"rewards/format_reward/mean": 0.3984375,
|
|
"rewards/format_reward/std": 0.22146137058734894,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 1,
|
|
"step_time": 171.41765936795855
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1504.0,
|
|
"completions/max_terminated_length": 1504.0,
|
|
"completions/mean_length": 377.046875,
|
|
"completions/mean_terminated_length": 377.046875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.20175037905573845,
|
|
"epoch": 0.0014285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.119359016418457,
|
|
"learning_rate": 5.555555555555555e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 255907.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.5093957781791687,
|
|
"rewards/format_reward/mean": 0.3125,
|
|
"rewards/format_reward/std": 0.2745848298072815,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 2,
|
|
"step_time": 83.64522138307802
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1984.0,
|
|
"completions/mean_length": 660.625,
|
|
"completions/mean_terminated_length": 638.6032104492188,
|
|
"completions/min_length": 71.0,
|
|
"completions/min_terminated_length": 71.0,
|
|
"entropy": 0.14103460405021906,
|
|
"epoch": 0.002142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.2874624729156494,
|
|
"learning_rate": 1.111111111111111e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 381059.0,
|
|
"reward": 0.43281248211860657,
|
|
"reward_std": 0.4954730272293091,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.25539806485176086,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 3,
|
|
"step_time": 131.4170093961293
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1881.0,
|
|
"completions/mean_length": 451.96875,
|
|
"completions/mean_terminated_length": 400.4838562011719,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.19842500798404217,
|
|
"epoch": 0.002857142857142857,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.316102027893066,
|
|
"learning_rate": 1.6666666666666665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 483425.0,
|
|
"reward": 0.24921873211860657,
|
|
"reward_std": 0.4258970022201538,
|
|
"rewards/format_reward/mean": 0.3046875,
|
|
"rewards/format_reward/std": 0.2615155577659607,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 4,
|
|
"step_time": 132.2972059249878
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1611.0,
|
|
"completions/max_terminated_length": 1611.0,
|
|
"completions/mean_length": 623.953125,
|
|
"completions/mean_terminated_length": 623.953125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1606526281684637,
|
|
"epoch": 0.0035714285714285713,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.722296714782715,
|
|
"learning_rate": 2.222222222222222e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 604470.0,
|
|
"reward": 0.39531248807907104,
|
|
"reward_std": 0.4883336126804352,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.2592533528804779,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 5,
|
|
"step_time": 119.59757148602512
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1604.0,
|
|
"completions/mean_length": 401.75,
|
|
"completions/mean_terminated_length": 375.61907958984375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1858495082706213,
|
|
"epoch": 0.004285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.736406326293945,
|
|
"learning_rate": 2.7777777777777776e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 713742.0,
|
|
"reward": 0.33281245827674866,
|
|
"reward_std": 0.4670252799987793,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.2741328477859497,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 6,
|
|
"step_time": 122.30180106399348
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1199.0,
|
|
"completions/max_terminated_length": 1199.0,
|
|
"completions/mean_length": 467.265625,
|
|
"completions/mean_terminated_length": 467.265625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.16867963038384914,
|
|
"epoch": 0.005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.460102558135986,
|
|
"learning_rate": 3.333333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 813639.0,
|
|
"reward": 0.36796873807907104,
|
|
"reward_std": 0.4780389070510864,
|
|
"rewards/format_reward/mean": 0.3984375,
|
|
"rewards/format_reward/std": 0.31090864539146423,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 7,
|
|
"step_time": 73.72938018315472
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1911.0,
|
|
"completions/mean_length": 508.4375,
|
|
"completions/mean_terminated_length": 484.0000305175781,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.14686184097081423,
|
|
"epoch": 0.005714285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.620323181152344,
|
|
"learning_rate": 3.888888888888889e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 956587.0,
|
|
"reward": 0.25312498211860657,
|
|
"reward_std": 0.42565304040908813,
|
|
"rewards/format_reward/mean": 0.34375,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 8,
|
|
"step_time": 147.8402461669757
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1204.0,
|
|
"completions/mean_length": 412.25,
|
|
"completions/mean_terminated_length": 386.2857360839844,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18391940742731094,
|
|
"epoch": 0.0064285714285714285,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 9.683939933776855,
|
|
"learning_rate": 4.444444444444444e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1075259.0,
|
|
"reward": 0.3140624761581421,
|
|
"reward_std": 0.4561282992362976,
|
|
"rewards/format_reward/mean": 0.328125,
|
|
"rewards/format_reward/std": 0.2847827076911926,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 9,
|
|
"step_time": 108.42918385588564
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1264.0,
|
|
"completions/mean_length": 521.125,
|
|
"completions/mean_terminated_length": 496.888916015625,
|
|
"completions/min_length": 90.0,
|
|
"completions/min_terminated_length": 90.0,
|
|
"entropy": 0.17283021286129951,
|
|
"epoch": 0.007142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.3954830169677734,
|
|
"learning_rate": 5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1193907.0,
|
|
"reward": 0.16171874105930328,
|
|
"reward_std": 0.3368554413318634,
|
|
"rewards/format_reward/mean": 0.3671875,
|
|
"rewards/format_reward/std": 0.23974503576755524,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 10,
|
|
"step_time": 118.48989919497399
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1067.0,
|
|
"completions/max_terminated_length": 1067.0,
|
|
"completions/mean_length": 442.40625,
|
|
"completions/mean_terminated_length": 442.40625,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"entropy": 0.19687055423855782,
|
|
"epoch": 0.007857142857142858,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.561274528503418,
|
|
"learning_rate": 5.555555555555555e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1288341.0,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.44818857312202454,
|
|
"rewards/format_reward/mean": 0.34375,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 11,
|
|
"step_time": 55.07885626098141
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1393.0,
|
|
"completions/mean_length": 442.171875,
|
|
"completions/mean_terminated_length": 416.68255615234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.22921719774603844,
|
|
"epoch": 0.008571428571428572,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.506412506103516,
|
|
"learning_rate": 6.111111111111112e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1388512.0,
|
|
"reward": 0.3687499761581421,
|
|
"reward_std": 0.48094648122787476,
|
|
"rewards/format_reward/mean": 0.40625,
|
|
"rewards/format_reward/std": 0.33184191584587097,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 12,
|
|
"step_time": 119.49877157399897
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1861.0,
|
|
"completions/mean_length": 471.09375,
|
|
"completions/mean_terminated_length": 420.2257995605469,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.14649338461458683,
|
|
"epoch": 0.009285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.722030639648438,
|
|
"learning_rate": 6.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1511798.0,
|
|
"reward": 0.22499997913837433,
|
|
"reward_std": 0.40029749274253845,
|
|
"rewards/format_reward/mean": 0.375,
|
|
"rewards/format_reward/std": 0.26726123690605164,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 13,
|
|
"step_time": 160.80015432706568
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1346.0,
|
|
"completions/mean_length": 409.171875,
|
|
"completions/mean_terminated_length": 356.3064270019531,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"entropy": 0.14898105338215828,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 38.745059967041016,
|
|
"learning_rate": 7.222222222222221e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1651729.0,
|
|
"reward": 0.2789062261581421,
|
|
"reward_std": 0.4511716961860657,
|
|
"rewards/format_reward/mean": 0.2890625,
|
|
"rewards/format_reward/std": 0.2789533734321594,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 14,
|
|
"step_time": 193.12235332495766
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1252.0,
|
|
"completions/max_terminated_length": 1252.0,
|
|
"completions/mean_length": 466.265625,
|
|
"completions/mean_terminated_length": 466.265625,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.19658867083489895,
|
|
"epoch": 0.010714285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 34.59446334838867,
|
|
"learning_rate": 7.777777777777778e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1746826.0,
|
|
"reward": 0.23281247913837433,
|
|
"reward_std": 0.4091433882713318,
|
|
"rewards/format_reward/mean": 0.296875,
|
|
"rewards/format_reward/std": 0.30496877431869507,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 15,
|
|
"step_time": 68.50137870694743
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1516.0,
|
|
"completions/mean_length": 417.140625,
|
|
"completions/mean_terminated_length": 364.5322570800781,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.15556670725345612,
|
|
"epoch": 0.011428571428571429,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 6.721803188323975,
|
|
"learning_rate": 8.333333333333333e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1870859.0,
|
|
"reward": 0.34062498807907104,
|
|
"reward_std": 0.4673358201980591,
|
|
"rewards/format_reward/mean": 0.4375,
|
|
"rewards/format_reward/std": 0.3149704039096832,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 16,
|
|
"step_time": 159.5833295909688
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1605.0,
|
|
"completions/mean_length": 444.390625,
|
|
"completions/mean_terminated_length": 392.6612854003906,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18389248382300138,
|
|
"epoch": 0.012142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.25290298461914,
|
|
"learning_rate": 8.888888888888888e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1987036.0,
|
|
"reward": 0.16093748807907104,
|
|
"reward_std": 0.34462639689445496,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.301698237657547,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 17,
|
|
"step_time": 163.66844313696492
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1489.0,
|
|
"completions/mean_length": 497.984375,
|
|
"completions/mean_terminated_length": 447.9838562011719,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.2151591945439577,
|
|
"epoch": 0.012857142857142857,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 49.3928108215332,
|
|
"learning_rate": 9.444444444444444e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2084659.0,
|
|
"reward": 0.32343748211860657,
|
|
"reward_std": 0.4590334892272949,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.29839184880256653,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 18,
|
|
"step_time": 127.23080269095954
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1315.0,
|
|
"completions/max_terminated_length": 1315.0,
|
|
"completions/mean_length": 358.28125,
|
|
"completions/mean_terminated_length": 358.28125,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"entropy": 0.16461750492453575,
|
|
"epoch": 0.013571428571428571,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 33.24488830566406,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2189413.0,
|
|
"reward": 0.21406248211860657,
|
|
"reward_std": 0.3865258991718292,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.2221602201461792,
|
|
"rewards/mcq_exact_match_reward/mean": 0.171875,
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
|
"step": 19,
|
|
"step_time": 81.95023821806535
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1865.0,
|
|
"completions/mean_length": 443.28125,
|
|
"completions/mean_terminated_length": 417.8095397949219,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.16434035263955593,
|
|
"epoch": 0.014285714285714285,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.118695259094238,
|
|
"learning_rate": 9.999776148326214e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2326511.0,
|
|
"reward": 0.42656248807907104,
|
|
"reward_std": 0.48713353276252747,
|
|
"rewards/format_reward/mean": 0.515625,
|
|
"rewards/format_reward/std": 0.1985812783241272,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 20,
|
|
"step_time": 171.49558448110474
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1352.0,
|
|
"completions/max_terminated_length": 1352.0,
|
|
"completions/mean_length": 286.140625,
|
|
"completions/mean_terminated_length": 286.140625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2154129333794117,
|
|
"epoch": 0.015,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 16.506973266601562,
|
|
"learning_rate": 9.999104613348689e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2431592.0,
|
|
"reward": 0.33203125,
|
|
"reward_std": 0.45828935503959656,
|
|
"rewards/format_reward/mean": 0.5078125,
|
|
"rewards/format_reward/std": 0.18881812691688538,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 21,
|
|
"step_time": 102.2051269490039
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 981.0,
|
|
"completions/max_terminated_length": 981.0,
|
|
"completions/mean_length": 351.015625,
|
|
"completions/mean_terminated_length": 351.015625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2384468913078308,
|
|
"epoch": 0.015714285714285715,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 8.052404403686523,
|
|
"learning_rate": 9.997985455197113e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2517985.0,
|
|
"reward": 0.20859375596046448,
|
|
"reward_std": 0.37886154651641846,
|
|
"rewards/format_reward/mean": 0.5234375,
|
|
"rewards/format_reward/std": 0.28770697116851807,
|
|
"rewards/mcq_exact_match_reward/mean": 0.15625,
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
|
"step": 22,
|
|
"step_time": 47.53120892500738
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1639.0,
|
|
"completions/max_terminated_length": 1639.0,
|
|
"completions/mean_length": 493.734375,
|
|
"completions/mean_terminated_length": 493.734375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.15667208284139633,
|
|
"epoch": 0.016428571428571428,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 4.182269096374512,
|
|
"learning_rate": 9.996418774081656e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2643640.0,
|
|
"reward": 0.2679687440395355,
|
|
"reward_std": 0.41770032048225403,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.22699186205863953,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 23,
|
|
"step_time": 136.94159113999922
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1209.0,
|
|
"completions/max_terminated_length": 1209.0,
|
|
"completions/mean_length": 334.75,
|
|
"completions/mean_terminated_length": 334.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2605742085725069,
|
|
"epoch": 0.017142857142857144,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 2.6181070804595947,
|
|
"learning_rate": 9.994404710283998e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2743904.0,
|
|
"reward": 0.08046875149011612,
|
|
"reward_std": 0.17719532549381256,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.24384792149066925,
|
|
"rewards/mcq_exact_match_reward/mean": 0.03125,
|
|
"rewards/mcq_exact_match_reward/std": 0.17536810040473938,
|
|
"step": 24,
|
|
"step_time": 67.75085840100655
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 991.0,
|
|
"completions/max_terminated_length": 991.0,
|
|
"completions/mean_length": 167.71875,
|
|
"completions/mean_terminated_length": 167.71875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2089465633034706,
|
|
"epoch": 0.017857142857142856,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 21.222145080566406,
|
|
"learning_rate": 9.991943444144756e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2839630.0,
|
|
"reward": 0.3820312023162842,
|
|
"reward_std": 0.4708458185195923,
|
|
"rewards/format_reward/mean": 0.5390625,
|
|
"rewards/format_reward/std": 0.18483558297157288,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 25,
|
|
"step_time": 46.71443971898407
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 692.0,
|
|
"completions/max_terminated_length": 692.0,
|
|
"completions/mean_length": 217.5,
|
|
"completions/mean_terminated_length": 217.5,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.25668232701718807,
|
|
"epoch": 0.018571428571428572,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 17.222293853759766,
|
|
"learning_rate": 9.989035196047348e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2927590.0,
|
|
"reward": 0.16249999403953552,
|
|
"reward_std": 0.3169797956943512,
|
|
"rewards/format_reward/mean": 0.53125,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.109375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3145764470100403,
|
|
"step": 26,
|
|
"step_time": 41.34394903801149
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1824.0,
|
|
"completions/mean_length": 324.53125,
|
|
"completions/mean_terminated_length": 297.17462158203125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18609545193612576,
|
|
"epoch": 0.019285714285714285,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 17.366985321044922,
|
|
"learning_rate": 9.98568022639826e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3043752.0,
|
|
"reward": 0.28359371423721313,
|
|
"reward_std": 0.431318998336792,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.1406387835741043,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 27,
|
|
"step_time": 145.95341787295183
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1195.0,
|
|
"completions/max_terminated_length": 1195.0,
|
|
"completions/mean_length": 215.234375,
|
|
"completions/mean_terminated_length": 215.234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2353730145841837,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 17.270282745361328,
|
|
"learning_rate": 9.981878835603716e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3131783.0,
|
|
"reward": 0.27578121423721313,
|
|
"reward_std": 0.4189927279949188,
|
|
"rewards/format_reward/mean": 0.5703125,
|
|
"rewards/format_reward/std": 0.1751912236213684,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 28,
|
|
"step_time": 48.64892271097051
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 865.0,
|
|
"completions/max_terminated_length": 865.0,
|
|
"completions/mean_length": 214.15625,
|
|
"completions/mean_terminated_length": 214.15625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2909251060336828,
|
|
"epoch": 0.020714285714285713,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.325923919677734,
|
|
"learning_rate": 9.977631364042794e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3226177.0,
|
|
"reward": 0.4117187261581421,
|
|
"reward_std": 0.4837634861469269,
|
|
"rewards/format_reward/mean": 0.5234375,
|
|
"rewards/format_reward/std": 0.1649840772151947,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 29,
|
|
"step_time": 55.44644622900523
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1167.0,
|
|
"completions/mean_length": 308.875,
|
|
"completions/mean_terminated_length": 281.2698669433594,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2629696223884821,
|
|
"epoch": 0.02142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.076370239257812,
|
|
"learning_rate": 9.972938192036944e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3343833.0,
|
|
"reward": 0.27421873807907104,
|
|
"reward_std": 0.4146132171154022,
|
|
"rewards/format_reward/mean": 0.5546875,
|
|
"rewards/format_reward/std": 0.26899561285972595,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 30,
|
|
"step_time": 177.31874076800887
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 520.0,
|
|
"completions/max_terminated_length": 520.0,
|
|
"completions/mean_length": 236.171875,
|
|
"completions/mean_terminated_length": 236.171875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2660892754793167,
|
|
"epoch": 0.02214285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.033559799194336,
|
|
"learning_rate": 9.967799739815924e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3407684.0,
|
|
"reward": 0.4117187261581421,
|
|
"reward_std": 0.4798099994659424,
|
|
"rewards/format_reward/mean": 0.6796875,
|
|
"rewards/format_reward/std": 0.30035942792892456,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 31,
|
|
"step_time": 19.77746521908557
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 762.0,
|
|
"completions/mean_length": 267.140625,
|
|
"completions/mean_terminated_length": 238.87303161621094,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2127502802759409,
|
|
"epoch": 0.022857142857142857,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 21.78681182861328,
|
|
"learning_rate": 9.96221646748019e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3501853.0,
|
|
"reward": 0.390625,
|
|
"reward_std": 0.47525057196617126,
|
|
"rewards/format_reward/mean": 0.625,
|
|
"rewards/format_reward/std": 0.26726123690605164,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 32,
|
|
"step_time": 118.76398004795192
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1112.0,
|
|
"completions/max_terminated_length": 1112.0,
|
|
"completions/mean_length": 195.171875,
|
|
"completions/mean_terminated_length": 195.171875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.26218850910663605,
|
|
"epoch": 0.023571428571428573,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 13.382572174072266,
|
|
"learning_rate": 9.956188874959686e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3603568.0,
|
|
"reward": 0.19062499701976776,
|
|
"reward_std": 0.3306888937950134,
|
|
"rewards/format_reward/mean": 0.65625,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 33,
|
|
"step_time": 57.80779201700352
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 888.0,
|
|
"completions/max_terminated_length": 888.0,
|
|
"completions/mean_length": 203.953125,
|
|
"completions/mean_terminated_length": 203.953125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.21508901193737984,
|
|
"epoch": 0.024285714285714285,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 18.717557907104492,
|
|
"learning_rate": 9.949717501969079e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3688533.0,
|
|
"reward": 0.5679687261581421,
|
|
"reward_std": 0.5023252964019775,
|
|
"rewards/format_reward/mean": 0.6796875,
|
|
"rewards/format_reward/std": 0.27265870571136475,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 34,
|
|
"step_time": 64.80360024399124
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1353.0,
|
|
"completions/max_terminated_length": 1353.0,
|
|
"completions/mean_length": 254.78125,
|
|
"completions/mean_terminated_length": 254.78125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.27256614714860916,
|
|
"epoch": 0.025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.232938766479492,
|
|
"learning_rate": 9.942802927959442e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3775567.0,
|
|
"reward": 0.38124996423721313,
|
|
"reward_std": 0.46908387541770935,
|
|
"rewards/format_reward/mean": 0.6875,
|
|
"rewards/format_reward/std": 0.3149704039096832,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 35,
|
|
"step_time": 69.50821864098543
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1524.0,
|
|
"completions/max_terminated_length": 1524.0,
|
|
"completions/mean_length": 279.328125,
|
|
"completions/mean_terminated_length": 279.328125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.21617292240262032,
|
|
"epoch": 0.025714285714285714,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 14.788715362548828,
|
|
"learning_rate": 9.93544577206636e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3873788.0,
|
|
"reward": 0.24140623211860657,
|
|
"reward_std": 0.38684260845184326,
|
|
"rewards/format_reward/mean": 0.6953125,
|
|
"rewards/format_reward/std": 0.2762732207775116,
|
|
"rewards/mcq_exact_match_reward/mean": 0.171875,
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
|
"step": 36,
|
|
"step_time": 112.71978642407339
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1100.0,
|
|
"completions/max_terminated_length": 1100.0,
|
|
"completions/mean_length": 204.046875,
|
|
"completions/mean_terminated_length": 204.046875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2291890811175108,
|
|
"epoch": 0.02642857142857143,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 22.162996292114258,
|
|
"learning_rate": 9.927646693054495e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3949719.0,
|
|
"reward": 0.43906253576278687,
|
|
"reward_std": 0.4866037666797638,
|
|
"rewards/format_reward/mean": 0.796875,
|
|
"rewards/format_reward/std": 0.2777281701564789,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 37,
|
|
"step_time": 62.183381506067235
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 675.0,
|
|
"completions/max_terminated_length": 675.0,
|
|
"completions/mean_length": 92.578125,
|
|
"completions/mean_terminated_length": 92.578125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.21844332106411457,
|
|
"epoch": 0.027142857142857142,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 21.4798641204834,
|
|
"learning_rate": 9.919406389258606e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4028188.0,
|
|
"reward": 0.44453126192092896,
|
|
"reward_std": 0.486411988735199,
|
|
"rewards/format_reward/mean": 0.6953125,
|
|
"rewards/format_reward/std": 0.2615155577659607,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 38,
|
|
"step_time": 44.90790424309671
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 877.0,
|
|
"completions/max_terminated_length": 877.0,
|
|
"completions/mean_length": 136.25,
|
|
"completions/mean_terminated_length": 136.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2506138999015093,
|
|
"epoch": 0.027857142857142858,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 27.89171028137207,
|
|
"learning_rate": 9.910725598521012e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4097708.0,
|
|
"reward": 0.4664062261581421,
|
|
"reward_std": 0.4868961274623871,
|
|
"rewards/format_reward/mean": 0.9140625,
|
|
"rewards/format_reward/std": 0.19012710452079773,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 39,
|
|
"step_time": 39.88727585604647
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 999.0,
|
|
"completions/max_terminated_length": 999.0,
|
|
"completions/mean_length": 136.984375,
|
|
"completions/mean_terminated_length": 136.984375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.20815920643508434,
|
|
"epoch": 0.02857142857142857,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 22.442346572875977,
|
|
"learning_rate": 9.901605098125526e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4190579.0,
|
|
"reward": 0.38749998807907104,
|
|
"reward_std": 0.4662412703037262,
|
|
"rewards/format_reward/mean": 0.75,
|
|
"rewards/format_reward/std": 0.26726123690605164,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 40,
|
|
"step_time": 62.169976764998864
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 512.0,
|
|
"completions/max_terminated_length": 512.0,
|
|
"completions/mean_length": 59.859375,
|
|
"completions/mean_terminated_length": 59.859375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2938700430095196,
|
|
"epoch": 0.029285714285714286,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 11.49561882019043,
|
|
"learning_rate": 9.892045704727863e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4283034.0,
|
|
"reward": 0.16562500596046448,
|
|
"reward_std": 0.2750000059604645,
|
|
"rewards/format_reward/mean": 0.875,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.078125,
|
|
"rewards/mcq_exact_match_reward/std": 0.27048972249031067,
|
|
"step": 41,
|
|
"step_time": 32.6582014990272
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 861.0,
|
|
"completions/max_terminated_length": 861.0,
|
|
"completions/mean_length": 75.765625,
|
|
"completions/mean_terminated_length": 75.765625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.25041171722114086,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 30.742996215820312,
|
|
"learning_rate": 9.882048274282505e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4361843.0,
|
|
"reward": 0.57421875,
|
|
"reward_std": 0.5030062198638916,
|
|
"rewards/format_reward/mean": 0.8984375,
|
|
"rewards/format_reward/std": 0.20275264978408813,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 42,
|
|
"step_time": 47.323365143092815
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 646.0,
|
|
"completions/max_terminated_length": 646.0,
|
|
"completions/mean_length": 47.390625,
|
|
"completions/mean_terminated_length": 47.390625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18909681774675846,
|
|
"epoch": 0.030714285714285715,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 22.67137908935547,
|
|
"learning_rate": 9.871613701966066e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4457780.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.485504150390625,
|
|
"rewards/format_reward/mean": 0.84375,
|
|
"rewards/format_reward/std": 0.233588308095932,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 43,
|
|
"step_time": 47.4528916090494
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 554.0,
|
|
"completions/mean_length": 83.21875,
|
|
"completions/mean_terminated_length": 52.0317497253418,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"entropy": 0.17218941450119019,
|
|
"epoch": 0.03142857142857143,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 12.08895492553711,
|
|
"learning_rate": 9.86074292209714e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4527074.0,
|
|
"reward": 0.628125011920929,
|
|
"reward_std": 0.50661301612854,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.17536810040473938,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 44,
|
|
"step_time": 107.07343659299659
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 651.0,
|
|
"completions/max_terminated_length": 651.0,
|
|
"completions/mean_length": 36.0,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18100928142666817,
|
|
"epoch": 0.03214285714285714,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 13.066922187805176,
|
|
"learning_rate": 9.849436908052636e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4608754.0,
|
|
"reward": 0.38203126192092896,
|
|
"reward_std": 0.465761661529541,
|
|
"rewards/format_reward/mean": 0.8515625,
|
|
"rewards/format_reward/std": 0.26246222853660583,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 45,
|
|
"step_time": 30.311806608980987
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 837.0,
|
|
"completions/max_terminated_length": 837.0,
|
|
"completions/mean_length": 67.34375,
|
|
"completions/mean_terminated_length": 67.34375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.19344223476946354,
|
|
"epoch": 0.032857142857142856,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 8.720047950744629,
|
|
"learning_rate": 9.837696672180618e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4691800.0,
|
|
"reward": 0.328125,
|
|
"reward_std": 0.4157489538192749,
|
|
"rewards/format_reward/mean": 0.9375,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 46,
|
|
"step_time": 44.673023908922914
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 509.0,
|
|
"completions/max_terminated_length": 509.0,
|
|
"completions/mean_length": 20.140625,
|
|
"completions/mean_terminated_length": 20.140625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.07546021463349462,
|
|
"epoch": 0.03357142857142857,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 9.505743980407715,
|
|
"learning_rate": 9.825523265709665e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4783617.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.49629583954811096,
|
|
"rewards/format_reward/mean": 0.9375,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 47,
|
|
"step_time": 28.177909465972334
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 190.0,
|
|
"completions/max_terminated_length": 190.0,
|
|
"completions/mean_length": 20.59375,
|
|
"completions/mean_terminated_length": 20.59375,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"entropy": 0.10830738116055727,
|
|
"epoch": 0.03428571428571429,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 14.707826614379883,
|
|
"learning_rate": 9.812917778654747e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4861247.0,
|
|
"reward": 0.43125003576278687,
|
|
"reward_std": 0.4888843894004822,
|
|
"rewards/format_reward/mean": 0.875,
|
|
"rewards/format_reward/std": 0.3333333432674408,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 48,
|
|
"step_time": 10.893696008017287
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 749.0,
|
|
"completions/mean_length": 66.125,
|
|
"completions/mean_terminated_length": 34.66666793823242,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.11854543350636959,
|
|
"epoch": 0.035,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 15.387039184570312,
|
|
"learning_rate": 9.799881339719614e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4968215.0,
|
|
"reward": 0.5382812023162842,
|
|
"reward_std": 0.504876434803009,
|
|
"rewards/format_reward/mean": 0.8515625,
|
|
"rewards/format_reward/std": 0.24688033759593964,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 49,
|
|
"step_time": 154.59990453493083
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1531.0,
|
|
"completions/max_terminated_length": 1531.0,
|
|
"completions/mean_length": 77.125,
|
|
"completions/mean_terminated_length": 77.125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"entropy": 0.09524603839963675,
|
|
"epoch": 0.03571428571428571,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 10.865472793579102,
|
|
"learning_rate": 9.786415116195732e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5049023.0,
|
|
"reward": 0.44218751788139343,
|
|
"reward_std": 0.47993209958076477,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.08768405020236969,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 50,
|
|
"step_time": 96.41551529400749
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 350,
|
|
"num_input_tokens_seen": 5049023,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|