2935 lines
105 KiB
JSON
2935 lines
105 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.07142857142857142,
|
|
"eval_steps": 500,
|
|
"global_step": 100,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1089.0,
|
|
"completions/mean_length": 453.359375,
|
|
"completions/mean_terminated_length": 428.0476379394531,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.20064565539360046,
|
|
"epoch": 0.0007142857142857143,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 5.157586574554443,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.0,
|
|
"num_tokens": 104207.0,
|
|
"reward": 0.3671875,
|
|
"reward_std": 0.47810959815979004,
|
|
"rewards/format_reward/mean": 0.390625,
|
|
"rewards/format_reward/std": 0.22658175230026245,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 1,
|
|
"step_time": 163.2008375240257
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1690.0,
|
|
"completions/mean_length": 558.921875,
|
|
"completions/mean_terminated_length": 535.2857666015625,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"entropy": 0.15827747248113155,
|
|
"epoch": 0.0014285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.5780975818634033,
|
|
"learning_rate": 5.555555555555555e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 224762.0,
|
|
"reward": 0.24296873807907104,
|
|
"reward_std": 0.4135470986366272,
|
|
"rewards/format_reward/mean": 0.3984375,
|
|
"rewards/format_reward/std": 0.28423789143562317,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 2,
|
|
"step_time": 134.8140414939844
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1883.0,
|
|
"completions/mean_length": 589.8125,
|
|
"completions/mean_terminated_length": 566.6666870117188,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1299030063673854,
|
|
"epoch": 0.002142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.490052223205566,
|
|
"learning_rate": 1.111111111111111e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 346622.0,
|
|
"reward": 0.32109373807907104,
|
|
"reward_std": 0.4618633985519409,
|
|
"rewards/format_reward/mean": 0.3984375,
|
|
"rewards/format_reward/std": 0.2387082874774933,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 3,
|
|
"step_time": 124.3698272620677
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1316.0,
|
|
"completions/max_terminated_length": 1316.0,
|
|
"completions/mean_length": 522.0625,
|
|
"completions/mean_terminated_length": 522.0625,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"entropy": 0.16233543679118156,
|
|
"epoch": 0.002857142857142857,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4298690557479858,
|
|
"learning_rate": 1.6666666666666665e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 460962.0,
|
|
"reward": 0.27656251192092896,
|
|
"reward_std": 0.4340624213218689,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.2847827076911926,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 4,
|
|
"step_time": 90.78269547002856
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1844.0,
|
|
"completions/mean_length": 650.03125,
|
|
"completions/mean_terminated_length": 581.2786865234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1047047358006239,
|
|
"epoch": 0.0035714285714285713,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.9983320236206055,
|
|
"learning_rate": 2.222222222222222e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 626636.0,
|
|
"reward": 0.23515623807907104,
|
|
"reward_std": 0.41525280475616455,
|
|
"rewards/format_reward/mean": 0.3203125,
|
|
"rewards/format_reward/std": 0.24180518090724945,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 5,
|
|
"step_time": 171.21265639393823
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 937.0,
|
|
"completions/max_terminated_length": 937.0,
|
|
"completions/mean_length": 379.109375,
|
|
"completions/mean_terminated_length": 379.109375,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"entropy": 0.17905950360000134,
|
|
"epoch": 0.004285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 33.00831604003906,
|
|
"learning_rate": 2.7777777777777776e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 742675.0,
|
|
"reward": 0.22421874105930328,
|
|
"reward_std": 0.39880695939064026,
|
|
"rewards/format_reward/mean": 0.3671875,
|
|
"rewards/format_reward/std": 0.28510910272598267,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 6,
|
|
"step_time": 77.91981936100638
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1596.0,
|
|
"completions/max_terminated_length": 1596.0,
|
|
"completions/mean_length": 479.34375,
|
|
"completions/mean_terminated_length": 479.34375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.11762952525168657,
|
|
"epoch": 0.005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.298130989074707,
|
|
"learning_rate": 3.333333333333333e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 846929.0,
|
|
"reward": 0.4898437261581421,
|
|
"reward_std": 0.5113147497177124,
|
|
"rewards/format_reward/mean": 0.3671875,
|
|
"rewards/format_reward/std": 0.255761981010437,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 7,
|
|
"step_time": 89.83808165497612
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1542.0,
|
|
"completions/max_terminated_length": 1542.0,
|
|
"completions/mean_length": 634.296875,
|
|
"completions/mean_terminated_length": 634.296875,
|
|
"completions/min_length": 77.0,
|
|
"completions/min_terminated_length": 77.0,
|
|
"entropy": 0.158901397138834,
|
|
"epoch": 0.005714285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4249234199523926,
|
|
"learning_rate": 3.888888888888889e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 984668.0,
|
|
"reward": 0.3351562023162842,
|
|
"reward_std": 0.465447336435318,
|
|
"rewards/format_reward/mean": 0.3828125,
|
|
"rewards/format_reward/std": 0.2634054720401764,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 8,
|
|
"step_time": 113.26597948698327
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1933.0,
|
|
"completions/max_terminated_length": 1933.0,
|
|
"completions/mean_length": 481.453125,
|
|
"completions/mean_terminated_length": 481.453125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1585789266973734,
|
|
"epoch": 0.0064285714285714285,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 3.944404125213623,
|
|
"learning_rate": 4.444444444444444e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1116649.0,
|
|
"reward": 0.16718748211860657,
|
|
"reward_std": 0.3370293378829956,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.2221602201461792,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 9,
|
|
"step_time": 132.11293392902007
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1689.0,
|
|
"completions/mean_length": 455.625,
|
|
"completions/mean_terminated_length": 430.3492431640625,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"entropy": 0.15571903064846992,
|
|
"epoch": 0.007142857142857143,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.225871086120605,
|
|
"learning_rate": 5e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1237337.0,
|
|
"reward": 0.29921871423721313,
|
|
"reward_std": 0.45211073756217957,
|
|
"rewards/format_reward/mean": 0.3359375,
|
|
"rewards/format_reward/std": 0.2366211861371994,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 10,
|
|
"step_time": 141.26959226594772
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1274.0,
|
|
"completions/mean_length": 530.84375,
|
|
"completions/mean_terminated_length": 506.7619323730469,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.12162926886230707,
|
|
"epoch": 0.007857142857142858,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 26.4554443359375,
|
|
"learning_rate": 5.555555555555555e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1358607.0,
|
|
"reward": 0.23671871423721313,
|
|
"reward_std": 0.4144456684589386,
|
|
"rewards/format_reward/mean": 0.3359375,
|
|
"rewards/format_reward/std": 0.2366211861371994,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 11,
|
|
"step_time": 136.65451408794615
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1548.0,
|
|
"completions/mean_length": 458.390625,
|
|
"completions/mean_terminated_length": 433.15875244140625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.14663300476968288,
|
|
"epoch": 0.008571428571428572,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.722622871398926,
|
|
"learning_rate": 6.111111111111112e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1471192.0,
|
|
"reward": 0.22499997913837433,
|
|
"reward_std": 0.3999999761581421,
|
|
"rewards/format_reward/mean": 0.375,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 12,
|
|
"step_time": 133.89911664801184
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1495.0,
|
|
"completions/mean_length": 562.9375,
|
|
"completions/mean_terminated_length": 539.3651123046875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.15633145160973072,
|
|
"epoch": 0.009285714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.441755771636963,
|
|
"learning_rate": 6.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1576364.0,
|
|
"reward": 0.25703123211860657,
|
|
"reward_std": 0.42340895533561707,
|
|
"rewards/format_reward/mean": 0.3828125,
|
|
"rewards/format_reward/std": 0.21347814798355103,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 13,
|
|
"step_time": 121.01016625500051
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2038.0,
|
|
"completions/mean_length": 596.859375,
|
|
"completions/mean_terminated_length": 573.825439453125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.11535925976932049,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.867958068847656,
|
|
"learning_rate": 7.222222222222221e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1739315.0,
|
|
"reward": 0.1874999850988388,
|
|
"reward_std": 0.375224769115448,
|
|
"rewards/format_reward/mean": 0.3125,
|
|
"rewards/format_reward/std": 0.3021090030670166,
|
|
"rewards/mcq_exact_match_reward/mean": 0.15625,
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
|
"step": 14,
|
|
"step_time": 186.91490571206668
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1531.0,
|
|
"completions/max_terminated_length": 1531.0,
|
|
"completions/mean_length": 396.5,
|
|
"completions/mean_terminated_length": 396.5,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.17848026007413864,
|
|
"epoch": 0.010714285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.981410503387451,
|
|
"learning_rate": 7.777777777777778e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1836883.0,
|
|
"reward": 0.34843745827674866,
|
|
"reward_std": 0.47941502928733826,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.32694777846336365,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 15,
|
|
"step_time": 94.68117612809874
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1874.0,
|
|
"completions/mean_length": 598.296875,
|
|
"completions/mean_terminated_length": 575.2857666015625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.12071683909744024,
|
|
"epoch": 0.011428571428571429,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.546281814575195,
|
|
"learning_rate": 8.333333333333333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1974854.0,
|
|
"reward": 0.14687499403953552,
|
|
"reward_std": 0.3447882831096649,
|
|
"rewards/format_reward/mean": 0.21875,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.125,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 16,
|
|
"step_time": 148.51339858904248
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1741.0,
|
|
"completions/mean_length": 611.171875,
|
|
"completions/mean_terminated_length": 564.8225708007812,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1321120047941804,
|
|
"epoch": 0.012142857142857143,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 5.823922157287598,
|
|
"learning_rate": 8.888888888888888e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2119753.0,
|
|
"reward": 0.17578125,
|
|
"reward_std": 0.3575702905654907,
|
|
"rewards/format_reward/mean": 0.3515625,
|
|
"rewards/format_reward/std": 0.29113471508026123,
|
|
"rewards/mcq_exact_match_reward/mean": 0.140625,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 17,
|
|
"step_time": 182.10150892706588
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1289.0,
|
|
"completions/mean_length": 559.859375,
|
|
"completions/mean_terminated_length": 511.8548278808594,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1420492259785533,
|
|
"epoch": 0.012857142857142857,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.969266891479492,
|
|
"learning_rate": 9.444444444444444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2257376.0,
|
|
"reward": 0.25624996423721313,
|
|
"reward_std": 0.42393654584884644,
|
|
"rewards/format_reward/mean": 0.375,
|
|
"rewards/format_reward/std": 0.2357022762298584,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 18,
|
|
"step_time": 162.68666934006615
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1478.0,
|
|
"completions/max_terminated_length": 1478.0,
|
|
"completions/mean_length": 580.640625,
|
|
"completions/mean_terminated_length": 580.640625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.123278739862144,
|
|
"epoch": 0.013571428571428571,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 13.4293851852417,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2411657.0,
|
|
"reward": 0.2242187261581421,
|
|
"reward_std": 0.40247172117233276,
|
|
"rewards/format_reward/mean": 0.3671875,
|
|
"rewards/format_reward/std": 0.23974503576755524,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 19,
|
|
"step_time": 114.04774017888121
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1521.0,
|
|
"completions/max_terminated_length": 1521.0,
|
|
"completions/mean_length": 439.953125,
|
|
"completions/mean_terminated_length": 439.953125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.10359836835414171,
|
|
"epoch": 0.014285714285714285,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 18.170589447021484,
|
|
"learning_rate": 9.999776148326214e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2542550.0,
|
|
"reward": 0.20546874403953552,
|
|
"reward_std": 0.38863998651504517,
|
|
"rewards/format_reward/mean": 0.3359375,
|
|
"rewards/format_reward/std": 0.2680720090866089,
|
|
"rewards/mcq_exact_match_reward/mean": 0.171875,
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
|
"step": 20,
|
|
"step_time": 94.10481164290104
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1384.0,
|
|
"completions/mean_length": 583.359375,
|
|
"completions/mean_terminated_length": 511.3278503417969,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.12750612013041973,
|
|
"epoch": 0.015,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 10.771635055541992,
|
|
"learning_rate": 9.999104613348689e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2661437.0,
|
|
"reward": 0.2593749761581421,
|
|
"reward_std": 0.4222835302352905,
|
|
"rewards/format_reward/mean": 0.40625,
|
|
"rewards/format_reward/std": 0.233588308095932,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 21,
|
|
"step_time": 178.62638382194564
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1586.0,
|
|
"completions/mean_length": 381.640625,
|
|
"completions/mean_terminated_length": 355.19049072265625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.15890471823513508,
|
|
"epoch": 0.015714285714285715,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 11.206506729125977,
|
|
"learning_rate": 9.997985455197113e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2776894.0,
|
|
"reward": 0.18593749403953552,
|
|
"reward_std": 0.35270705819129944,
|
|
"rewards/format_reward/mean": 0.453125,
|
|
"rewards/format_reward/std": 0.1717960685491562,
|
|
"rewards/mcq_exact_match_reward/mean": 0.140625,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 22,
|
|
"step_time": 140.86376020603348
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 899.0,
|
|
"completions/mean_length": 310.015625,
|
|
"completions/mean_terminated_length": 224.5409698486328,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18053795211017132,
|
|
"epoch": 0.016428571428571428,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 9.23369026184082,
|
|
"learning_rate": 9.996418774081656e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2898815.0,
|
|
"reward": 0.3109374940395355,
|
|
"reward_std": 0.4464558959007263,
|
|
"rewards/format_reward/mean": 0.453125,
|
|
"rewards/format_reward/std": 0.14689241349697113,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 23,
|
|
"step_time": 148.10121405799873
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1393.0,
|
|
"completions/max_terminated_length": 1393.0,
|
|
"completions/mean_length": 232.359375,
|
|
"completions/mean_terminated_length": 232.359375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.16719974670559168,
|
|
"epoch": 0.017142857142857144,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 4.078112602233887,
|
|
"learning_rate": 9.994404710283998e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2993910.0,
|
|
"reward": 0.28203123807907104,
|
|
"reward_std": 0.4266301095485687,
|
|
"rewards/format_reward/mean": 0.4765625,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 24,
|
|
"step_time": 73.75073616893496
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 819.0,
|
|
"completions/max_terminated_length": 819.0,
|
|
"completions/mean_length": 111.78125,
|
|
"completions/mean_terminated_length": 111.78125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.25493066385388374,
|
|
"epoch": 0.017857142857142856,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 16.993078231811523,
|
|
"learning_rate": 9.991943444144756e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3085936.0,
|
|
"reward": 0.3023437261581421,
|
|
"reward_std": 0.4352912902832031,
|
|
"rewards/format_reward/mean": 0.5234375,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 25,
|
|
"step_time": 45.09112752194051
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1207.0,
|
|
"completions/max_terminated_length": 1207.0,
|
|
"completions/mean_length": 132.671875,
|
|
"completions/mean_terminated_length": 132.671875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2280710767954588,
|
|
"epoch": 0.018571428571428572,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 15.601746559143066,
|
|
"learning_rate": 9.989035196047348e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3186115.0,
|
|
"reward": 0.20468749105930328,
|
|
"reward_std": 0.3647915720939636,
|
|
"rewards/format_reward/mean": 0.484375,
|
|
"rewards/format_reward/std": 0.1534975916147232,
|
|
"rewards/mcq_exact_match_reward/mean": 0.15625,
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
|
"step": 26,
|
|
"step_time": 68.13181330589578
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 808.0,
|
|
"completions/max_terminated_length": 808.0,
|
|
"completions/mean_length": 64.9375,
|
|
"completions/mean_terminated_length": 64.9375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.22438204288482666,
|
|
"epoch": 0.019285714285714285,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 11.98585033416748,
|
|
"learning_rate": 9.98568022639826e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3275551.0,
|
|
"reward": 0.4406249523162842,
|
|
"reward_std": 0.4918280243873596,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.08908708393573761,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 27,
|
|
"step_time": 46.977470892015845
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 612.0,
|
|
"completions/max_terminated_length": 612.0,
|
|
"completions/mean_length": 50.109375,
|
|
"completions/mean_terminated_length": 50.109375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.2098379861563444,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 18.382869720458984,
|
|
"learning_rate": 9.981878835603716e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3357406.0,
|
|
"reward": 0.39531248807907104,
|
|
"reward_std": 0.4776528775691986,
|
|
"rewards/format_reward/mean": 0.515625,
|
|
"rewards/format_reward/std": 0.08768405020236969,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 28,
|
|
"step_time": 35.3679761699168
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.10740844532847404,
|
|
"epoch": 0.020714285714285713,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 18.927444458007812,
|
|
"learning_rate": 9.977631364042794e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3456198.0,
|
|
"reward": 0.5187499523162842,
|
|
"reward_std": 0.502967357635498,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 29,
|
|
"step_time": 4.8387698759906925
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 352.0,
|
|
"completions/max_terminated_length": 352.0,
|
|
"completions/mean_length": 15.125,
|
|
"completions/mean_terminated_length": 15.125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.197237528860569,
|
|
"epoch": 0.02142857142857143,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 18.760665893554688,
|
|
"learning_rate": 9.972938192036944e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3529918.0,
|
|
"reward": 0.3781249523162842,
|
|
"reward_std": 0.47324231266975403,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 30,
|
|
"step_time": 19.270987000956666
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 55.0,
|
|
"completions/max_terminated_length": 55.0,
|
|
"completions/mean_length": 7.53125,
|
|
"completions/mean_terminated_length": 7.53125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.14352155569940805,
|
|
"epoch": 0.02214285714285714,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 17.66056251525879,
|
|
"learning_rate": 9.967799739815924e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3604272.0,
|
|
"reward": 0.27812498807907104,
|
|
"reward_std": 0.4307500422000885,
|
|
"rewards/format_reward/mean": 0.4375,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 31,
|
|
"step_time": 5.142326800851151
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.16279550455510616,
|
|
"epoch": 0.022857142857142857,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 19.25493621826172,
|
|
"learning_rate": 9.96221646748019e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3687568.0,
|
|
"reward": 0.22187498211860657,
|
|
"reward_std": 0.3802541494369507,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.171875,
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
|
"step": 32,
|
|
"step_time": 3.4762189310858957
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 108.0,
|
|
"completions/max_terminated_length": 108.0,
|
|
"completions/mean_length": 7.59375,
|
|
"completions/mean_terminated_length": 7.59375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.1700758682563901,
|
|
"epoch": 0.023571428571428573,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 23.92763328552246,
|
|
"learning_rate": 9.956188874959686e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3782558.0,
|
|
"reward": 0.5499999523162842,
|
|
"reward_std": 0.5039525628089905,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 33,
|
|
"step_time": 7.063230810861569
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.10504745738580823,
|
|
"epoch": 0.024285714285714285,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 17.42229461669922,
|
|
"learning_rate": 9.949717501969079e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3872686.0,
|
|
"reward": 0.26874998211860657,
|
|
"reward_std": 0.4166666269302368,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 34,
|
|
"step_time": 3.867844623979181
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 29.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 6.359375,
|
|
"completions/mean_terminated_length": 6.359375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.18720911256968975,
|
|
"epoch": 0.025,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 26.774076461791992,
|
|
"learning_rate": 9.942802927959442e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3981189.0,
|
|
"reward": 0.29921871423721313,
|
|
"reward_std": 0.43693482875823975,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 35,
|
|
"step_time": 5.721210387942847
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.13526835106313229,
|
|
"epoch": 0.025714285714285714,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 25.46927833557129,
|
|
"learning_rate": 9.93544577206636e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4056141.0,
|
|
"reward": 0.3468749523162842,
|
|
"reward_std": 0.46049270033836365,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 36,
|
|
"step_time": 3.36784387397347
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.17059525474905968,
|
|
"epoch": 0.02642857142857143,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 20.648895263671875,
|
|
"learning_rate": 9.927646693054495e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4163493.0,
|
|
"reward": 0.2999999523162842,
|
|
"reward_std": 0.4364357590675354,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 37,
|
|
"step_time": 9.884533652977552
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 5.953125,
|
|
"completions/mean_terminated_length": 5.953125,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"entropy": 0.10067910794168711,
|
|
"epoch": 0.027142857142857142,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 24.11454963684082,
|
|
"learning_rate": 9.919406389258606e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4273250.0,
|
|
"reward": 0.28359371423721313,
|
|
"reward_std": 0.4274373948574066,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 38,
|
|
"step_time": 6.0175170440925285
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 67.0,
|
|
"completions/max_terminated_length": 67.0,
|
|
"completions/mean_length": 6.953125,
|
|
"completions/mean_terminated_length": 6.953125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.14652357157319784,
|
|
"epoch": 0.027857142857142858,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.282570838928223,
|
|
"learning_rate": 9.910725598521012e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4366839.0,
|
|
"reward": 0.4562499523162842,
|
|
"reward_std": 0.49501481652259827,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 39,
|
|
"step_time": 9.129241381015163
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.07260213000699878,
|
|
"epoch": 0.02857142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.272303581237793,
|
|
"learning_rate": 9.901605098125526e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4477503.0,
|
|
"reward": 0.39374998211860657,
|
|
"reward_std": 0.4787135422229767,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 40,
|
|
"step_time": 4.785194783064071
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1146.0,
|
|
"completions/max_terminated_length": 1146.0,
|
|
"completions/mean_length": 39.328125,
|
|
"completions/mean_terminated_length": 39.328125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.12252951040863991,
|
|
"epoch": 0.029285714285714286,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 9.645527839660645,
|
|
"learning_rate": 9.892045704727863e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4603468.0,
|
|
"reward": 0.3937499523162842,
|
|
"reward_std": 0.4787135422229767,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 41,
|
|
"step_time": 121.67112983297557
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 83.0,
|
|
"completions/max_terminated_length": 83.0,
|
|
"completions/mean_length": 7.203125,
|
|
"completions/mean_terminated_length": 7.203125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04064215812832117,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 17.42051124572754,
|
|
"learning_rate": 9.882048274282505e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4709177.0,
|
|
"reward": 0.7687499523162842,
|
|
"reward_std": 0.4531634449958801,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.71875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 42,
|
|
"step_time": 10.29577969602542
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 6.03125,
|
|
"completions/mean_terminated_length": 6.03125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.12579507380723953,
|
|
"epoch": 0.030714285714285715,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 36.053192138671875,
|
|
"learning_rate": 9.871613701966066e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4787955.0,
|
|
"reward": 0.40859371423721313,
|
|
"reward_std": 0.4842400550842285,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 43,
|
|
"step_time": 4.568186060001608
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10.0,
|
|
"completions/max_terminated_length": 10.0,
|
|
"completions/mean_length": 6.4375,
|
|
"completions/mean_terminated_length": 6.4375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.0865603880956769,
|
|
"epoch": 0.03142857142857143,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 14.81761360168457,
|
|
"learning_rate": 9.86074292209714e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4872295.0,
|
|
"reward": 0.6539061665534973,
|
|
"reward_std": 0.4988323152065277,
|
|
"rewards/format_reward/mean": 0.4453125,
|
|
"rewards/format_reward/std": 0.15728822350502014,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 44,
|
|
"step_time": 4.5612655181321315
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.05028381128795445,
|
|
"epoch": 0.03214285714285714,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 18.105215072631836,
|
|
"learning_rate": 9.849436908052636e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4971903.0,
|
|
"reward": 0.4718749523162842,
|
|
"reward_std": 0.4977628290653229,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 45,
|
|
"step_time": 4.8814199649496
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.0737028606235981,
|
|
"epoch": 0.032857142857142856,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 36.69444274902344,
|
|
"learning_rate": 9.837696672180618e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5051695.0,
|
|
"reward": 0.4874999523162842,
|
|
"reward_std": 0.4999999701976776,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 46,
|
|
"step_time": 3.359571039909497
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 358.0,
|
|
"completions/max_terminated_length": 358.0,
|
|
"completions/mean_length": 11.5,
|
|
"completions/mean_terminated_length": 11.5,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.0733404541388154,
|
|
"epoch": 0.03357142857142857,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 18.161724090576172,
|
|
"learning_rate": 9.825523265709665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5126431.0,
|
|
"reward": 0.5335937142372131,
|
|
"reward_std": 0.504507839679718,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 47,
|
|
"step_time": 17.83803274697857
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.054197699995711446,
|
|
"epoch": 0.03428571428571429,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 19.445640563964844,
|
|
"learning_rate": 9.812917778654747e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5230503.0,
|
|
"reward": 0.7531249523162842,
|
|
"reward_std": 0.46049270033836365,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 48,
|
|
"step_time": 4.160447052039672
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.05320898490026593,
|
|
"epoch": 0.035,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 11.983798027038574,
|
|
"learning_rate": 9.799881339719614e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5297479.0,
|
|
"reward": 0.3937499523162842,
|
|
"reward_std": 0.4787135422229767,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 49,
|
|
"step_time": 2.485525873955339
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 6.09375,
|
|
"completions/mean_terminated_length": 6.09375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.0547908297739923,
|
|
"epoch": 0.03571428571428571,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 15.623820304870605,
|
|
"learning_rate": 9.786415116195732e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5377581.0,
|
|
"reward": 0.7687499523162842,
|
|
"reward_std": 0.4531634449958801,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.71875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 50,
|
|
"step_time": 4.019255019025877
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.07222574669867754,
|
|
"epoch": 0.03642857142857143,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 23.187658309936523,
|
|
"learning_rate": 9.772520313857775e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5490213.0,
|
|
"reward": 0.4874999523162842,
|
|
"reward_std": 0.4999999701976776,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 51,
|
|
"step_time": 6.748388390929904
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.07622226374223828,
|
|
"epoch": 0.037142857142857144,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 15.004591941833496,
|
|
"learning_rate": 9.758198176855646e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5583557.0,
|
|
"reward": 0.4718749523162842,
|
|
"reward_std": 0.4977628290653229,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 52,
|
|
"step_time": 4.643642266979441
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.052028866310138255,
|
|
"epoch": 0.03785714285714286,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 18.302907943725586,
|
|
"learning_rate": 9.74344998760308e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5674525.0,
|
|
"reward": 0.26874998211860657,
|
|
"reward_std": 0.4166666269302368,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 53,
|
|
"step_time": 3.6161275009508245
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.07992784632369876,
|
|
"epoch": 0.03857142857142857,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 50.737979888916016,
|
|
"learning_rate": 9.72827706666282e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5768773.0,
|
|
"reward": 0.4093749523162842,
|
|
"reward_std": 0.4836103320121765,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 54,
|
|
"step_time": 4.318731640116312
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04811420664191246,
|
|
"epoch": 0.039285714285714285,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 8.901388168334961,
|
|
"learning_rate": 9.712680772628363e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5840629.0,
|
|
"reward": 0.4406249523162842,
|
|
"reward_std": 0.49174734950065613,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 55,
|
|
"step_time": 3.043417449865956
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.034670245833694935,
|
|
"epoch": 0.04,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 38.62637710571289,
|
|
"learning_rate": 9.696662502002318e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5928837.0,
|
|
"reward": 0.3468749523162842,
|
|
"reward_std": 0.46049270033836365,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 56,
|
|
"step_time": 8.042339402018115
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04627775074914098,
|
|
"epoch": 0.04071428571428572,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 20.798757553100586,
|
|
"learning_rate": 9.680223689071362e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6012549.0,
|
|
"reward": 0.41874998807907104,
|
|
"reward_std": 0.4930870831012726,
|
|
"rewards/format_reward/mean": 0.4375,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 57,
|
|
"step_time": 3.4416783688939176
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.045997242676094174,
|
|
"epoch": 0.041428571428571426,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 14.777454376220703,
|
|
"learning_rate": 9.663365805777814e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6088061.0,
|
|
"reward": 0.5656249523162842,
|
|
"reward_std": 0.5037064552307129,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 58,
|
|
"step_time": 3.701708526001312
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.06566549651324749,
|
|
"epoch": 0.04214285714285714,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 30.113880157470703,
|
|
"learning_rate": 9.646090361587827e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6172757.0,
|
|
"reward": 0.6437499523162842,
|
|
"reward_std": 0.49501481652259827,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.59375,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 59,
|
|
"step_time": 3.5152428460423835
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04948642780072987,
|
|
"epoch": 0.04285714285714286,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 19.78885841369629,
|
|
"learning_rate": 9.628398903356239e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6264821.0,
|
|
"reward": 0.6124999523162842,
|
|
"reward_std": 0.4999999701976776,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 60,
|
|
"step_time": 4.421045998169575
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.03131909319199622,
|
|
"epoch": 0.04357142857142857,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 34.9882698059082,
|
|
"learning_rate": 9.610293015188067e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6347405.0,
|
|
"reward": 0.7531249523162842,
|
|
"reward_std": 0.46049270033836365,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 61,
|
|
"step_time": 6.333404837932903
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04404397588223219,
|
|
"epoch": 0.04428571428571428,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 27.348413467407227,
|
|
"learning_rate": 9.59177431829666e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6419029.0,
|
|
"reward": 0.5031249523162842,
|
|
"reward_std": 0.5017330646514893,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 62,
|
|
"step_time": 3.639777671021875
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.028264216147363186,
|
|
"epoch": 0.045,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 25.774185180664062,
|
|
"learning_rate": 9.572844470858537e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6502877.0,
|
|
"reward": 0.7531249523162842,
|
|
"reward_std": 0.46049273014068604,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 63,
|
|
"step_time": 5.851632744073868
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04575204895809293,
|
|
"epoch": 0.045714285714285714,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.55350516786491e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6591573.0,
|
|
"reward": 0.29999998211860657,
|
|
"reward_std": 0.4364357590675354,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 64,
|
|
"step_time": 4.74795475701103
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.0018548955195001327,
|
|
"epoch": 0.04642857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.533758140969912e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6704141.0,
|
|
"reward": 0.5499999523162842,
|
|
"reward_std": 0.5039525628089905,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 65,
|
|
"step_time": 4.814983140968252
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.018711353768594563,
|
|
"epoch": 0.047142857142857146,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.513605158335562e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6782221.0,
|
|
"reward": 0.6749999523162842,
|
|
"reward_std": 0.48794999718666077,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 66,
|
|
"step_time": 3.5154523900710046
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.057149365777149796,
|
|
"epoch": 0.047857142857142855,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 20.58985137939453,
|
|
"learning_rate": 9.493048024473411e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6851269.0,
|
|
"reward": 0.6742187142372131,
|
|
"reward_std": 0.489005446434021,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 67,
|
|
"step_time": 2.9816590580740012
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.030840615974739194,
|
|
"epoch": 0.04857142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 26.677947998046875,
|
|
"learning_rate": 9.47208858008299e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6936149.0,
|
|
"reward": 0.6593749523162842,
|
|
"reward_std": 0.49174734950065613,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 68,
|
|
"step_time": 5.698866136022843
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 6.1875,
|
|
"completions/mean_terminated_length": 6.1875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.06430117995478213,
|
|
"epoch": 0.04928571428571429,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 48.29899978637695,
|
|
"learning_rate": 9.450728701886983e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7002961.0,
|
|
"reward": 0.6859374046325684,
|
|
"reward_std": 0.49009785056114197,
|
|
"rewards/format_reward/mean": 0.453125,
|
|
"rewards/format_reward/std": 0.14689241349697113,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 69,
|
|
"step_time": 3.3110440919408575
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.06107360986061394,
|
|
"epoch": 0.05,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.428970302463184e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7080225.0,
|
|
"reward": 0.5499999523162842,
|
|
"reward_std": 0.5039525628089905,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 70,
|
|
"step_time": 3.588115891034249
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.019286671769805253,
|
|
"epoch": 0.05071428571428571,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 10.453094482421875,
|
|
"learning_rate": 9.406815330073244e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7157009.0,
|
|
"reward": 0.4093749523162842,
|
|
"reward_std": 0.4836103320121765,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 71,
|
|
"step_time": 4.012491253030021
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.005534585099667311,
|
|
"epoch": 0.05142857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.384265768488224e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7251009.0,
|
|
"reward": 1.0499999523162842,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 1.0,
|
|
"rewards/mcq_exact_match_reward/std": 0.0,
|
|
"step": 72,
|
|
"step_time": 4.052724620094523
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.03679531207308173,
|
|
"epoch": 0.052142857142857144,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 37.07649230957031,
|
|
"learning_rate": 9.36132363681097e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7329065.0,
|
|
"reward": 0.34609371423721313,
|
|
"reward_std": 0.461046427488327,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 73,
|
|
"step_time": 3.5614252468803898
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.06942176586017013,
|
|
"epoch": 0.05285714285714286,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 28.538206100463867,
|
|
"learning_rate": 9.337990989295304e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7427713.0,
|
|
"reward": 0.26874998211860657,
|
|
"reward_std": 0.4166666269302368,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 74,
|
|
"step_time": 4.526347325881943
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.02778073405715986,
|
|
"epoch": 0.05357142857142857,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 17.399362564086914,
|
|
"learning_rate": 9.314269915162114e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7527089.0,
|
|
"reward": 0.5812499523162842,
|
|
"reward_std": 0.502967357635498,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 75,
|
|
"step_time": 5.331380940915551
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.06143995188176632,
|
|
"epoch": 0.054285714285714284,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 37.28912353515625,
|
|
"learning_rate": 9.290162538412255e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7613169.0,
|
|
"reward": 0.5968749523162842,
|
|
"reward_std": 0.5017330646514893,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 76,
|
|
"step_time": 4.3536295120138675
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.0089755498120212,
|
|
"epoch": 0.055,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.265671017636382e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7690257.0,
|
|
"reward": 0.9249999523162842,
|
|
"reward_std": 0.3333333134651184,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 77,
|
|
"step_time": 3.8471228859853
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.0636584812309593,
|
|
"epoch": 0.055714285714285716,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 75.22062683105469,
|
|
"learning_rate": 9.240797545821666e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7786777.0,
|
|
"reward": 0.5968749523162842,
|
|
"reward_std": 0.5017330646514893,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 78,
|
|
"step_time": 4.731224032060709
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.010438685640110634,
|
|
"epoch": 0.056428571428571425,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.215544350155422e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7879257.0,
|
|
"reward": 0.6749999523162842,
|
|
"reward_std": 0.48794999718666077,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 79,
|
|
"step_time": 5.060094357933849
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.003100438669207506,
|
|
"epoch": 0.05714285714285714,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.189913691825699e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7939841.0,
|
|
"reward": 0.6749999523162842,
|
|
"reward_std": 0.48794999718666077,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 80,
|
|
"step_time": 2.383018973923754
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.0016284913563140435,
|
|
"epoch": 0.05785714285714286,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.163907865818806e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8000473.0,
|
|
"reward": 0.6749999523162842,
|
|
"reward_std": 0.48794999718666077,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 81,
|
|
"step_time": 2.3158561410964467
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.01943864591885358,
|
|
"epoch": 0.05857142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 31.108356475830078,
|
|
"learning_rate": 9.137529200713809e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8074729.0,
|
|
"reward": 0.3937499523162842,
|
|
"reward_std": 0.4787135422229767,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 82,
|
|
"step_time": 3.5820812580059282
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.019582699635066092,
|
|
"epoch": 0.05928571428571429,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.11078005847405e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8149137.0,
|
|
"reward": 0.29999998211860657,
|
|
"reward_std": 0.4364357590675354,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 83,
|
|
"step_time": 2.6342398920096457
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04416700848378241,
|
|
"epoch": 0.06,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 41.02012634277344,
|
|
"learning_rate": 9.083662834235629e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8220905.0,
|
|
"reward": 0.4406249523162842,
|
|
"reward_std": 0.49174734950065613,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 84,
|
|
"step_time": 3.4052489919704385
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.02520724307396449,
|
|
"epoch": 0.060714285714285714,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 17.11612892150879,
|
|
"learning_rate": 9.056179956092961e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8312417.0,
|
|
"reward": 0.5656249523162842,
|
|
"reward_std": 0.5037064552307129,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 85,
|
|
"step_time": 5.020384737115819
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.03604305279441178,
|
|
"epoch": 0.06142857142857143,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 81.0152359008789,
|
|
"learning_rate": 9.028333884881356e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8386809.0,
|
|
"reward": 0.6124999523162842,
|
|
"reward_std": 0.4999999701976776,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 86,
|
|
"step_time": 3.479461514914874
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04329964122734964,
|
|
"epoch": 0.062142857142857146,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 48.07200622558594,
|
|
"learning_rate": 9.000127113956672e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8471817.0,
|
|
"reward": 0.3468749523162842,
|
|
"reward_std": 0.46049270033836365,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 87,
|
|
"step_time": 4.475756738916971
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.028254332719370723,
|
|
"epoch": 0.06285714285714286,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 135.7811737060547,
|
|
"learning_rate": 8.971562168972064e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8560945.0,
|
|
"reward": 0.6437499523162842,
|
|
"reward_std": 0.49501481652259827,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.59375,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 88,
|
|
"step_time": 6.017775994958356
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04081520880572498,
|
|
"epoch": 0.06357142857142857,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 9.588271141052246,
|
|
"learning_rate": 8.942641607651828e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8668209.0,
|
|
"reward": 0.6906249523162842,
|
|
"reward_std": 0.4836103320121765,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 89,
|
|
"step_time": 3.933919732866343
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.012149464862886816,
|
|
"epoch": 0.06428571428571428,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.276681900024414,
|
|
"learning_rate": 8.91336801956239e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8759393.0,
|
|
"reward": 0.6593749523162842,
|
|
"reward_std": 0.49174734950065613,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 90,
|
|
"step_time": 4.012565175944474
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.00394415354821831,
|
|
"epoch": 0.065,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 42.945594787597656,
|
|
"learning_rate": 8.883744025880427e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8841201.0,
|
|
"reward": 0.6593749523162842,
|
|
"reward_std": 0.49174734950065613,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 91,
|
|
"step_time": 3.554952215985395
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.020873073022812605,
|
|
"epoch": 0.06571428571428571,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.853772279158165e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8926921.0,
|
|
"reward": 0.4249999523162842,
|
|
"reward_std": 0.48794999718666077,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 92,
|
|
"step_time": 3.4018443040549755
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.017624020569201093,
|
|
"epoch": 0.06642857142857143,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.823455463085873e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9020281.0,
|
|
"reward": 0.6749999523162842,
|
|
"reward_std": 0.48794999718666077,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 93,
|
|
"step_time": 3.9275616111117415
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.029963097535073757,
|
|
"epoch": 0.06714285714285714,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 21.280263900756836,
|
|
"learning_rate": 8.792796292251559e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9108273.0,
|
|
"reward": 0.5656249523162842,
|
|
"reward_std": 0.5037064552307129,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 94,
|
|
"step_time": 4.094957825960591
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.01988843083381653,
|
|
"epoch": 0.06785714285714285,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.761797511897906e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9201273.0,
|
|
"reward": 0.6749999523162842,
|
|
"reward_std": 0.48794999718666077,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 95,
|
|
"step_time": 4.90473719505826
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.002151613953174092,
|
|
"epoch": 0.06857142857142857,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.730461897676463e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9289649.0,
|
|
"reward": 1.0499999523162842,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 1.0,
|
|
"rewards/mcq_exact_match_reward/std": 0.0,
|
|
"step": 96,
|
|
"step_time": 5.864397282944992
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.028051164787029848,
|
|
"epoch": 0.06928571428571428,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.698792255399103e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9385057.0,
|
|
"reward": 0.6749999523162842,
|
|
"reward_std": 0.48795002698898315,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 97,
|
|
"step_time": 5.353533354995307
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.04004533472470939,
|
|
"epoch": 0.07,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 42.5,
|
|
"learning_rate": 8.666791420786803e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 9463201.0,
|
|
"reward": 0.8781249523162842,
|
|
"reward_std": 0.38025417923927307,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.828125,
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
|
"step": 98,
|
|
"step_time": 3.3163292090175673
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.033746465924195945,
|
|
"epoch": 0.07071428571428572,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 36.30033874511719,
|
|
"learning_rate": 8.634462259215718e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 9550801.0,
|
|
"reward": 0.4718749523162842,
|
|
"reward_std": 0.4977628290653229,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 99,
|
|
"step_time": 4.70873619185295
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"entropy": 0.020849275402724743,
|
|
"epoch": 0.07142857142857142,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 40.67396926879883,
|
|
"learning_rate": 8.601807665460619e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 9638393.0,
|
|
"reward": 0.7218749523162842,
|
|
"reward_std": 0.47324231266975403,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 100,
|
|
"step_time": 4.529204568010755
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 350,
|
|
"num_input_tokens_seen": 9638393,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|