2735 lines
98 KiB
JSON
2735 lines
98 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.125,
|
|
"eval_steps": 500,
|
|
"global_step": 100,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1743.0,
|
|
"completions/mean_length": 471.375,
|
|
"completions/mean_terminated_length": 446.3492431640625,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.00125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 27.719789505004883,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.0,
|
|
"num_tokens": 107576.0,
|
|
"reward": 0.32109373807907104,
|
|
"reward_std": 0.35813236236572266,
|
|
"rewards/format_reward/mean": 0.3984375,
|
|
"rewards/format_reward/std": 0.31090864539146423,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1343.0,
|
|
"completions/max_terminated_length": 1343.0,
|
|
"completions/mean_length": 420.609375,
|
|
"completions/mean_terminated_length": 420.609375,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 15.845207214355469,
|
|
"learning_rate": 1e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 217615.0,
|
|
"reward": 0.0898437350988388,
|
|
"reward_std": 0.1637348234653473,
|
|
"rewards/format_reward/mean": 0.2734375,
|
|
"rewards/format_reward/std": 0.2662152051925659,
|
|
"rewards/mcq_exact_match_reward/mean": 0.0625,
|
|
"rewards/mcq_exact_match_reward/std": 0.24397502839565277,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1600.0,
|
|
"completions/mean_length": 604.359375,
|
|
"completions/mean_terminated_length": 581.4444580078125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.00375,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 3.8941690921783447,
|
|
"learning_rate": 2e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 346454.0,
|
|
"reward": 0.2398437261581421,
|
|
"reward_std": 0.34310027956962585,
|
|
"rewards/format_reward/mean": 0.3671875,
|
|
"rewards/format_reward/std": 0.23974503576755524,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1399.0,
|
|
"completions/max_terminated_length": 1399.0,
|
|
"completions/mean_length": 485.09375,
|
|
"completions/mean_terminated_length": 485.09375,
|
|
"completions/min_length": 15.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 3.741612672805786,
|
|
"learning_rate": 3e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 453380.0,
|
|
"reward": 0.18437498807907104,
|
|
"reward_std": 0.2578701674938202,
|
|
"rewards/format_reward/mean": 0.4375,
|
|
"rewards/format_reward/std": 0.33923351764678955,
|
|
"rewards/mcq_exact_match_reward/mean": 0.140625,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1614.0,
|
|
"completions/mean_length": 588.8125,
|
|
"completions/mean_terminated_length": 517.0491333007812,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.00625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.286240339279175,
|
|
"learning_rate": 4e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 582664.0,
|
|
"reward": 0.25390625,
|
|
"reward_std": 0.264077365398407,
|
|
"rewards/format_reward/mean": 0.3515625,
|
|
"rewards/format_reward/std": 0.24688033759593964,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1609.0,
|
|
"completions/mean_length": 472.859375,
|
|
"completions/mean_terminated_length": 447.857177734375,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.0075,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 14.97139835357666,
|
|
"learning_rate": 5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 689791.0,
|
|
"reward": 0.17031249403953552,
|
|
"reward_std": 0.27876684069633484,
|
|
"rewards/format_reward/mean": 0.296875,
|
|
"rewards/format_reward/std": 0.2916666865348816,
|
|
"rewards/mcq_exact_match_reward/mean": 0.140625,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1476.0,
|
|
"completions/mean_length": 554.796875,
|
|
"completions/mean_terminated_length": 531.0952758789062,
|
|
"completions/min_length": 7.0,
|
|
"completions/min_terminated_length": 7.0,
|
|
"epoch": 0.00875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.5775206089019775,
|
|
"learning_rate": 6e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 802154.0,
|
|
"reward": 0.26953125,
|
|
"reward_std": 0.3855266869068146,
|
|
"rewards/format_reward/mean": 0.3515625,
|
|
"rewards/format_reward/std": 0.2302463799715042,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1346.0,
|
|
"completions/mean_length": 447.140625,
|
|
"completions/mean_terminated_length": 421.7301940917969,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.306600570678711,
|
|
"learning_rate": 7e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 909003.0,
|
|
"reward": 0.3125,
|
|
"reward_std": 0.41604068875312805,
|
|
"rewards/format_reward/mean": 0.3125,
|
|
"rewards/format_reward/std": 0.2745848298072815,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1898.0,
|
|
"completions/max_terminated_length": 1898.0,
|
|
"completions/mean_length": 663.96875,
|
|
"completions/mean_terminated_length": 663.96875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.01125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.932246208190918,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1022305.0,
|
|
"reward": 0.1484375,
|
|
"reward_std": 0.2908669710159302,
|
|
"rewards/format_reward/mean": 0.390625,
|
|
"rewards/format_reward/std": 0.2592533528804779,
|
|
"rewards/mcq_exact_match_reward/mean": 0.109375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3145764470100403,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1804.0,
|
|
"completions/max_terminated_length": 1804.0,
|
|
"completions/mean_length": 508.078125,
|
|
"completions/mean_terminated_length": 508.078125,
|
|
"completions/min_length": 29.0,
|
|
"completions/min_terminated_length": 29.0,
|
|
"epoch": 0.0125,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.8813296556472778,
|
|
"learning_rate": 9e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1149174.0,
|
|
"reward": 0.39531248807907104,
|
|
"reward_std": 0.25146484375,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.2741328477859497,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1901.0,
|
|
"completions/max_terminated_length": 1901.0,
|
|
"completions/mean_length": 587.140625,
|
|
"completions/mean_terminated_length": 587.140625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.01375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.928985118865967,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 1288087.0,
|
|
"reward": 0.26640623807907104,
|
|
"reward_std": 0.413688063621521,
|
|
"rewards/format_reward/mean": 0.3203125,
|
|
"rewards/format_reward/std": 0.27265870571136475,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1818.0,
|
|
"completions/max_terminated_length": 1818.0,
|
|
"completions/mean_length": 631.4375,
|
|
"completions/mean_terminated_length": 631.4375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.051449298858643,
|
|
"learning_rate": 9.999316524962345e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1432555.0,
|
|
"reward": 0.24062499403953552,
|
|
"reward_std": 0.37458372116088867,
|
|
"rewards/format_reward/mean": 0.375,
|
|
"rewards/format_reward/std": 0.2357022762298584,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1405.0,
|
|
"completions/max_terminated_length": 1405.0,
|
|
"completions/mean_length": 502.890625,
|
|
"completions/mean_terminated_length": 502.890625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.01625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.412359237670898,
|
|
"learning_rate": 9.99726628670463e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1562860.0,
|
|
"reward": 0.3140624761581421,
|
|
"reward_std": 0.4029204845428467,
|
|
"rewards/format_reward/mean": 0.328125,
|
|
"rewards/format_reward/std": 0.23935678601264954,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 911.0,
|
|
"completions/max_terminated_length": 911.0,
|
|
"completions/mean_length": 404.0,
|
|
"completions/mean_terminated_length": 404.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0175,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 5.526406288146973,
|
|
"learning_rate": 9.993849845741523e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1660916.0,
|
|
"reward": 0.296875,
|
|
"reward_std": 0.35833704471588135,
|
|
"rewards/format_reward/mean": 0.46875,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1440.0,
|
|
"completions/mean_length": 540.46875,
|
|
"completions/mean_terminated_length": 491.83868408203125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.01875,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.893363952636719,
|
|
"learning_rate": 9.989068136093872e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1783450.0,
|
|
"reward": 0.38203123211860657,
|
|
"reward_std": 0.30731916427612305,
|
|
"rewards/format_reward/mean": 0.3828125,
|
|
"rewards/format_reward/std": 0.2313210517168045,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1407.0,
|
|
"completions/max_terminated_length": 1407.0,
|
|
"completions/mean_length": 553.1875,
|
|
"completions/mean_terminated_length": 553.1875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.701499342918396,
|
|
"learning_rate": 9.982922465033348e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1904958.0,
|
|
"reward": 0.5335937738418579,
|
|
"reward_std": 0.41488444805145264,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.1406387835741043,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1545.0,
|
|
"completions/max_terminated_length": 1545.0,
|
|
"completions/mean_length": 344.5,
|
|
"completions/mean_terminated_length": 344.5,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.02125,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 12.08929443359375,
|
|
"learning_rate": 9.975414512725056e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2031982.0,
|
|
"reward": 0.500781238079071,
|
|
"reward_std": 0.28046733140945435,
|
|
"rewards/format_reward/mean": 0.4765625,
|
|
"rewards/format_reward/std": 0.10652101784944534,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1117.0,
|
|
"completions/mean_length": 473.9375,
|
|
"completions/mean_terminated_length": 448.952392578125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 8.033034324645996,
|
|
"learning_rate": 9.966546331768192e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2146218.0,
|
|
"reward": 0.606249988079071,
|
|
"reward_std": 0.4029581844806671,
|
|
"rewards/format_reward/mean": 0.4375,
|
|
"rewards/format_reward/std": 0.24397502839565277,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1098.0,
|
|
"completions/max_terminated_length": 1098.0,
|
|
"completions/mean_length": 385.65625,
|
|
"completions/mean_terminated_length": 385.65625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.02375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.02350425720215,
|
|
"learning_rate": 9.956320346634875e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2258940.0,
|
|
"reward": 0.36953121423721313,
|
|
"reward_std": 0.33910423517227173,
|
|
"rewards/format_reward/mean": 0.4140625,
|
|
"rewards/format_reward/std": 0.2280818521976471,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1207.0,
|
|
"completions/max_terminated_length": 1207.0,
|
|
"completions/mean_length": 400.625,
|
|
"completions/mean_terminated_length": 400.625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.332343101501465,
|
|
"learning_rate": 9.944739353007341e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2363068.0,
|
|
"reward": 0.2867187559604645,
|
|
"reward_std": 0.3396008610725403,
|
|
"rewards/format_reward/mean": 0.5234375,
|
|
"rewards/format_reward/std": 0.2735668122768402,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1964.0,
|
|
"completions/max_terminated_length": 1964.0,
|
|
"completions/mean_length": 399.25,
|
|
"completions/mean_terminated_length": 399.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.02625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 11.555954933166504,
|
|
"learning_rate": 9.931806517013612e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2465620.0,
|
|
"reward": 0.3179687559604645,
|
|
"reward_std": 0.44024717807769775,
|
|
"rewards/format_reward/mean": 0.5234375,
|
|
"rewards/format_reward/std": 0.2735668122768402,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1505.0,
|
|
"completions/max_terminated_length": 1505.0,
|
|
"completions/mean_length": 327.234375,
|
|
"completions/mean_terminated_length": 327.234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0275,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 4.289510726928711,
|
|
"learning_rate": 9.917525374361911e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2565427.0,
|
|
"reward": 0.28984373807907104,
|
|
"reward_std": 0.3399752378463745,
|
|
"rewards/format_reward/mean": 0.5546875,
|
|
"rewards/format_reward/std": 0.2538151443004608,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 940.0,
|
|
"completions/max_terminated_length": 940.0,
|
|
"completions/mean_length": 297.296875,
|
|
"completions/mean_terminated_length": 297.296875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.02875,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 13.854007720947266,
|
|
"learning_rate": 9.901899829374047e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2667206.0,
|
|
"reward": 0.32890623807907104,
|
|
"reward_std": 0.2701229453086853,
|
|
"rewards/format_reward/mean": 0.4765625,
|
|
"rewards/format_reward/std": 0.20758795738220215,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1274.0,
|
|
"completions/max_terminated_length": 1274.0,
|
|
"completions/mean_length": 195.265625,
|
|
"completions/mean_terminated_length": 195.265625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 16.784299850463867,
|
|
"learning_rate": 9.884934153917996e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2755663.0,
|
|
"reward": 0.31640625,
|
|
"reward_std": 0.1695163995027542,
|
|
"rewards/format_reward/mean": 0.5078125,
|
|
"rewards/format_reward/std": 0.20877929031848907,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1681.0,
|
|
"completions/max_terminated_length": 1681.0,
|
|
"completions/mean_length": 253.875,
|
|
"completions/mean_terminated_length": 253.875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03125,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 25.41985511779785,
|
|
"learning_rate": 9.866632986240029e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2837511.0,
|
|
"reward": 0.2749999761581421,
|
|
"reward_std": 0.20281967520713806,
|
|
"rewards/format_reward/mean": 0.5625,
|
|
"rewards/format_reward/std": 0.208927720785141,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1067.0,
|
|
"completions/max_terminated_length": 1067.0,
|
|
"completions/mean_length": 199.59375,
|
|
"completions/mean_terminated_length": 199.59375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0325,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 8.237688064575195,
|
|
"learning_rate": 9.847001329696652e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2942205.0,
|
|
"reward": 0.38984376192092896,
|
|
"reward_std": 0.24267949163913727,
|
|
"rewards/format_reward/mean": 0.6171875,
|
|
"rewards/format_reward/std": 0.21347814798355103,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 492.0,
|
|
"completions/max_terminated_length": 492.0,
|
|
"completions/mean_length": 92.25,
|
|
"completions/mean_terminated_length": 92.25,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03375,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 15.814329147338867,
|
|
"learning_rate": 9.826044551386742e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3037685.0,
|
|
"reward": 0.526562511920929,
|
|
"reward_std": 0.17365704476833344,
|
|
"rewards/format_reward/mean": 0.578125,
|
|
"rewards/format_reward/std": 0.23935678601264954,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 677.0,
|
|
"completions/max_terminated_length": 677.0,
|
|
"completions/mean_length": 136.734375,
|
|
"completions/mean_terminated_length": 136.734375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.035,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 16.19939613342285,
|
|
"learning_rate": 9.803768380684242e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3138908.0,
|
|
"reward": 0.4531249701976776,
|
|
"reward_std": 0.23708730936050415,
|
|
"rewards/format_reward/mean": 0.625,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 543.0,
|
|
"completions/max_terminated_length": 543.0,
|
|
"completions/mean_length": 85.0625,
|
|
"completions/mean_terminated_length": 85.0625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03625,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 20.77176284790039,
|
|
"learning_rate": 9.780178907671788e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3233712.0,
|
|
"reward": 0.3414062261581421,
|
|
"reward_std": 0.28602826595306396,
|
|
"rewards/format_reward/mean": 0.6015625,
|
|
"rewards/format_reward/std": 0.2387082874774933,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 832.0,
|
|
"completions/max_terminated_length": 832.0,
|
|
"completions/mean_length": 79.84375,
|
|
"completions/mean_terminated_length": 79.84375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0375,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 12.60765266418457,
|
|
"learning_rate": 9.755282581475767e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3333926.0,
|
|
"reward": 0.16249999403953552,
|
|
"reward_std": 0.20131680369377136,
|
|
"rewards/format_reward/mean": 0.53125,
|
|
"rewards/format_reward/std": 0.1510545015335083,
|
|
"rewards/mcq_exact_match_reward/mean": 0.109375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3145764470100403,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 555.0,
|
|
"completions/max_terminated_length": 555.0,
|
|
"completions/mean_length": 75.1875,
|
|
"completions/mean_terminated_length": 75.1875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03875,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 15.49142074584961,
|
|
"learning_rate": 9.729086208503173e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3423410.0,
|
|
"reward": 0.6078124642372131,
|
|
"reward_std": 0.270576536655426,
|
|
"rewards/format_reward/mean": 0.609375,
|
|
"rewards/format_reward/std": 0.2083333432674408,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 817.0,
|
|
"completions/max_terminated_length": 817.0,
|
|
"completions/mean_length": 53.890625,
|
|
"completions/mean_terminated_length": 53.890625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 22.353435516357422,
|
|
"learning_rate": 9.701596950580807e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3516563.0,
|
|
"reward": 0.31953126192092896,
|
|
"reward_std": 0.2242286503314972,
|
|
"rewards/format_reward/mean": 0.6953125,
|
|
"rewards/format_reward/std": 0.24587368965148926,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 172.0,
|
|
"completions/max_terminated_length": 172.0,
|
|
"completions/mean_length": 20.453125,
|
|
"completions/mean_terminated_length": 20.453125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04125,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 12.851003646850586,
|
|
"learning_rate": 9.672822322997304e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3591016.0,
|
|
"reward": 0.668749988079071,
|
|
"reward_std": 0.2041158676147461,
|
|
"rewards/format_reward/mean": 0.75,
|
|
"rewards/format_reward/std": 0.2519763112068176,
|
|
"rewards/mcq_exact_match_reward/mean": 0.59375,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 405.0,
|
|
"completions/max_terminated_length": 405.0,
|
|
"completions/mean_length": 58.453125,
|
|
"completions/mean_terminated_length": 58.453125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0425,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 15.381024360656738,
|
|
"learning_rate": 9.642770192448535e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3684013.0,
|
|
"reward": 0.5406250357627869,
|
|
"reward_std": 0.3712288737297058,
|
|
"rewards/format_reward/mean": 0.71875,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 620.0,
|
|
"completions/max_terminated_length": 620.0,
|
|
"completions/mean_length": 61.640625,
|
|
"completions/mean_terminated_length": 61.640625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04375,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 16.52124786376953,
|
|
"learning_rate": 9.611448774886923e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3761286.0,
|
|
"reward": 0.33124998211860657,
|
|
"reward_std": 0.25726157426834106,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.12198751419782639,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 844.0,
|
|
"completions/mean_length": 94.890625,
|
|
"completions/mean_terminated_length": 63.888893127441406,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.045,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 19.276613235473633,
|
|
"learning_rate": 9.578866633275286e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3846599.0,
|
|
"reward": 0.31640625,
|
|
"reward_std": 0.2186937928199768,
|
|
"rewards/format_reward/mean": 0.8203125,
|
|
"rewards/format_reward/std": 0.2576941251754761,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 362.0,
|
|
"completions/max_terminated_length": 362.0,
|
|
"completions/mean_length": 44.015625,
|
|
"completions/mean_terminated_length": 44.015625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04625,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 28.070011138916016,
|
|
"learning_rate": 9.545032675245813e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3917384.0,
|
|
"reward": 0.542187511920929,
|
|
"reward_std": 0.21420830488204956,
|
|
"rewards/format_reward/mean": 0.890625,
|
|
"rewards/format_reward/std": 0.2083333432674408,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 167.0,
|
|
"completions/mean_length": 70.375,
|
|
"completions/mean_terminated_length": 38.984130859375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0475,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 24.695430755615234,
|
|
"learning_rate": 9.509956150664795e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3998968.0,
|
|
"reward": 0.45078128576278687,
|
|
"reward_std": 0.18445391952991486,
|
|
"rewards/format_reward/mean": 0.9140625,
|
|
"rewards/format_reward/std": 0.209963858127594,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 559.0,
|
|
"completions/max_terminated_length": 559.0,
|
|
"completions/mean_length": 44.234375,
|
|
"completions/mean_terminated_length": 44.234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04875,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 9.522443771362305,
|
|
"learning_rate": 9.473646649103817e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4083311.0,
|
|
"reward": 0.2914062738418579,
|
|
"reward_std": 0.18989473581314087,
|
|
"rewards/format_reward/mean": 0.8828125,
|
|
"rewards/format_reward/std": 0.21347814798355103,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 225.0,
|
|
"completions/max_terminated_length": 225.0,
|
|
"completions/mean_length": 36.28125,
|
|
"completions/mean_terminated_length": 36.28125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 36.65068054199219,
|
|
"learning_rate": 9.436114097218058e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4155497.0,
|
|
"reward": 0.43906253576278687,
|
|
"reward_std": 0.20156370103359222,
|
|
"rewards/format_reward/mean": 0.953125,
|
|
"rewards/format_reward/std": 0.14689241349697113,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 297.0,
|
|
"completions/max_terminated_length": 297.0,
|
|
"completions/mean_length": 35.578125,
|
|
"completions/mean_terminated_length": 35.578125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05125,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 24.931636810302734,
|
|
"learning_rate": 9.397368756032444e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4249622.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.23814013600349426,
|
|
"rewards/format_reward/mean": 0.9375,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 285.0,
|
|
"completions/max_terminated_length": 285.0,
|
|
"completions/mean_length": 29.625,
|
|
"completions/mean_terminated_length": 29.625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0525,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 15.338611602783203,
|
|
"learning_rate": 9.357421218136386e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4333710.0,
|
|
"reward": 0.3531250059604645,
|
|
"reward_std": 0.1804211586713791,
|
|
"rewards/format_reward/mean": 0.875,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 710.0,
|
|
"completions/max_terminated_length": 710.0,
|
|
"completions/mean_length": 72.84375,
|
|
"completions/mean_terminated_length": 72.84375,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.05375,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 10.507906913757324,
|
|
"learning_rate": 9.316282404787869e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4440996.0,
|
|
"reward": 0.4296875,
|
|
"reward_std": 0.21748682856559753,
|
|
"rewards/format_reward/mean": 0.859375,
|
|
"rewards/format_reward/std": 0.3503824472427368,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 101.0,
|
|
"completions/max_terminated_length": 101.0,
|
|
"completions/mean_length": 17.46875,
|
|
"completions/mean_terminated_length": 17.46875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.055,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 16.530738830566406,
|
|
"learning_rate": 9.273963562927694e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4527058.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.2709311842918396,
|
|
"rewards/format_reward/mean": 0.9375,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 274.0,
|
|
"completions/max_terminated_length": 274.0,
|
|
"completions/mean_length": 38.5625,
|
|
"completions/mean_terminated_length": 38.5625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.05625,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 9.306171417236328,
|
|
"learning_rate": 9.230476262104676e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4591334.0,
|
|
"reward": 0.6625000238418579,
|
|
"reward_std": 0.23356688022613525,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 664.0,
|
|
"completions/max_terminated_length": 664.0,
|
|
"completions/mean_length": 49.96875,
|
|
"completions/mean_terminated_length": 49.96875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0575,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 8.561665534973145,
|
|
"learning_rate": 9.185832391312642e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4671492.0,
|
|
"reward": 0.3890625238418579,
|
|
"reward_std": 0.1695934236049652,
|
|
"rewards/format_reward/mean": 0.921875,
|
|
"rewards/format_reward/std": 0.18298126757144928,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1079.0,
|
|
"completions/mean_length": 102.84375,
|
|
"completions/mean_terminated_length": 71.96826171875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.05875,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 5.362051486968994,
|
|
"learning_rate": 9.1400441557401e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4742906.0,
|
|
"reward": 0.612500011920929,
|
|
"reward_std": 0.10678248107433319,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.1510545015335083,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 275.0,
|
|
"completions/max_terminated_length": 275.0,
|
|
"completions/mean_length": 43.265625,
|
|
"completions/mean_terminated_length": 43.265625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.06,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 9.450302124023438,
|
|
"learning_rate": 9.093124073433462e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4808035.0,
|
|
"reward": 0.4906250238418579,
|
|
"reward_std": 0.2109457552433014,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 264.0,
|
|
"completions/max_terminated_length": 264.0,
|
|
"completions/mean_length": 27.03125,
|
|
"completions/mean_terminated_length": 27.03125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.06125,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 7.115367889404297,
|
|
"learning_rate": 9.045084971874737e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4889933.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.12255740165710449,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 194.0,
|
|
"completions/max_terminated_length": 194.0,
|
|
"completions/mean_length": 28.4375,
|
|
"completions/mean_terminated_length": 28.4375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0625,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 15.029036521911621,
|
|
"learning_rate": 8.995939984474623e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4971121.0,
|
|
"reward": 0.8031250238418579,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 771.0,
|
|
"completions/max_terminated_length": 771.0,
|
|
"completions/mean_length": 80.90625,
|
|
"completions/mean_terminated_length": 80.90625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06375,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 30.694612503051758,
|
|
"learning_rate": 8.945702546981968e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5048643.0,
|
|
"reward": 0.6203124523162842,
|
|
"reward_std": 0.07164573669433594,
|
|
"rewards/format_reward/mean": 0.890625,
|
|
"rewards/format_reward/std": 0.2592533528804779,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 455.0,
|
|
"completions/max_terminated_length": 455.0,
|
|
"completions/mean_length": 37.765625,
|
|
"completions/mean_terminated_length": 37.765625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.065,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 12.923490524291992,
|
|
"learning_rate": 8.894386393810562e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5134100.0,
|
|
"reward": 0.5046875476837158,
|
|
"reward_std": 0.11330723762512207,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 473.0,
|
|
"completions/max_terminated_length": 473.0,
|
|
"completions/mean_length": 41.296875,
|
|
"completions/mean_terminated_length": 41.296875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.06625,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 12.523974418640137,
|
|
"learning_rate": 8.842005554284295e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5252135.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.22461533546447754,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 441.0,
|
|
"completions/max_terminated_length": 441.0,
|
|
"completions/mean_length": 74.453125,
|
|
"completions/mean_terminated_length": 74.453125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0675,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 15.70406723022461,
|
|
"learning_rate": 8.788574348801674e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5334148.0,
|
|
"reward": 0.659375011920929,
|
|
"reward_std": 0.25896912813186646,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.17536810040473938,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 671.0,
|
|
"completions/max_terminated_length": 671.0,
|
|
"completions/mean_length": 46.109375,
|
|
"completions/mean_terminated_length": 46.109375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.06875,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 23.37506103515625,
|
|
"learning_rate": 8.734107384920769e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5411715.0,
|
|
"reward": 0.5531250238418579,
|
|
"reward_std": 0.17358146607875824,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 628.0,
|
|
"completions/max_terminated_length": 628.0,
|
|
"completions/mean_length": 40.359375,
|
|
"completions/mean_terminated_length": 40.359375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.07,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 16.58426856994629,
|
|
"learning_rate": 8.678619553365658e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5495986.0,
|
|
"reward": 0.6312500238418579,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 333.0,
|
|
"completions/max_terminated_length": 333.0,
|
|
"completions/mean_length": 49.171875,
|
|
"completions/mean_terminated_length": 49.171875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.07125,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 9.968092918395996,
|
|
"learning_rate": 8.622126023955445e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5589749.0,
|
|
"reward": 0.6765625476837158,
|
|
"reward_std": 0.06687791645526886,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.08768405020236969,
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 330.0,
|
|
"completions/max_terminated_length": 330.0,
|
|
"completions/mean_length": 45.828125,
|
|
"completions/mean_terminated_length": 45.828125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0725,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 1.7135562896728516,
|
|
"learning_rate": 8.564642241456986e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5676570.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 458.0,
|
|
"completions/max_terminated_length": 458.0,
|
|
"completions/mean_length": 39.21875,
|
|
"completions/mean_terminated_length": 39.21875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.07375,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 17.85677146911621,
|
|
"learning_rate": 8.506183921362442e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5756528.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 186.0,
|
|
"completions/max_terminated_length": 186.0,
|
|
"completions/mean_length": 22.78125,
|
|
"completions/mean_terminated_length": 22.78125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.075,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 33.40819549560547,
|
|
"learning_rate": 8.446767045592829e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5829058.0,
|
|
"reward": 0.7710937857627869,
|
|
"reward_std": 0.09984822571277618,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 179.0,
|
|
"completions/max_terminated_length": 179.0,
|
|
"completions/mean_length": 17.703125,
|
|
"completions/mean_terminated_length": 17.703125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.07625,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 6.3193039894104,
|
|
"learning_rate": 8.386407858128706e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5903063.0,
|
|
"reward": 0.5062500238418579,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 226.0,
|
|
"completions/max_terminated_length": 226.0,
|
|
"completions/mean_length": 33.96875,
|
|
"completions/mean_terminated_length": 33.96875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0775,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.325122860569241e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5984173.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1581.0,
|
|
"completions/max_terminated_length": 1581.0,
|
|
"completions/mean_length": 96.734375,
|
|
"completions/mean_terminated_length": 96.734375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.07875,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 11.058859825134277,
|
|
"learning_rate": 8.262928807620843e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6062100.0,
|
|
"reward": 0.6296875476837158,
|
|
"reward_std": 0.15962307155132294,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.08768405020236969,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 466.0,
|
|
"completions/max_terminated_length": 466.0,
|
|
"completions/mean_length": 34.109375,
|
|
"completions/mean_terminated_length": 34.109375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.08,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 6.3704962730407715,
|
|
"learning_rate": 8.199842702516582e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6133635.0,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 302.0,
|
|
"completions/max_terminated_length": 302.0,
|
|
"completions/mean_length": 30.375,
|
|
"completions/mean_terminated_length": 30.375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.08125,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.135881792367685e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6218427.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 389.0,
|
|
"completions/max_terminated_length": 389.0,
|
|
"completions/mean_length": 44.90625,
|
|
"completions/mean_terminated_length": 44.90625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0825,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 6.963454723358154,
|
|
"learning_rate": 8.071063563448339e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6304541.0,
|
|
"reward": 0.9117187857627869,
|
|
"reward_std": 0.06768143177032471,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.8125,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 115.0,
|
|
"completions/max_terminated_length": 115.0,
|
|
"completions/mean_length": 15.109375,
|
|
"completions/mean_terminated_length": 15.109375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.08375,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 10.610746383666992,
|
|
"learning_rate": 8.005405736415125e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6382612.0,
|
|
"reward": 0.6929687857627869,
|
|
"reward_std": 0.09059805423021317,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.59375,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 161.0,
|
|
"completions/max_terminated_length": 161.0,
|
|
"completions/mean_length": 21.765625,
|
|
"completions/mean_terminated_length": 21.765625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.085,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.212800979614258,
|
|
"learning_rate": 7.938926261462365e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6446757.0,
|
|
"reward": 0.7875000238418579,
|
|
"reward_std": 0.06681530922651291,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.6875,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 312.0,
|
|
"completions/mean_length": 71.140625,
|
|
"completions/mean_terminated_length": 39.761905670166016,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.08625,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 9.373642921447754,
|
|
"learning_rate": 7.871643313414718e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6524198.0,
|
|
"reward": 1.0046875476837158,
|
|
"reward_std": 0.11330723762512207,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.90625,
|
|
"rewards/mcq_exact_match_reward/std": 0.29378482699394226,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 222.0,
|
|
"completions/max_terminated_length": 222.0,
|
|
"completions/mean_length": 27.703125,
|
|
"completions/mean_terminated_length": 27.703125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0875,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 14.864502906799316,
|
|
"learning_rate": 7.803575286758363e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6599307.0,
|
|
"reward": 0.8187500238418579,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.71875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 199.0,
|
|
"completions/max_terminated_length": 199.0,
|
|
"completions/mean_length": 32.71875,
|
|
"completions/mean_terminated_length": 32.71875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.08875,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 1.8891595602035522,
|
|
"learning_rate": 7.734740790612136e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6682921.0,
|
|
"reward": 0.7093750238418579,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 556.0,
|
|
"completions/max_terminated_length": 556.0,
|
|
"completions/mean_length": 27.71875,
|
|
"completions/mean_terminated_length": 27.71875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.09,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 3.046689033508301,
|
|
"learning_rate": 7.665158643639969e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6767743.0,
|
|
"reward": 0.8492187857627869,
|
|
"reward_std": 0.00220970856025815,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 260.0,
|
|
"completions/max_terminated_length": 260.0,
|
|
"completions/mean_length": 22.734375,
|
|
"completions/mean_terminated_length": 22.734375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.09125,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 7.001441955566406,
|
|
"learning_rate": 7.594847868906076e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6864566.0,
|
|
"reward": 0.8968750238418579,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.796875,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.0625,
|
|
"completions/mean_terminated_length": 12.0625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0925,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 32.40105056762695,
|
|
"learning_rate": 7.523827688674219e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6921450.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.11100947856903076,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 179.0,
|
|
"completions/max_terminated_length": 179.0,
|
|
"completions/mean_length": 14.734375,
|
|
"completions/mean_terminated_length": 14.734375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.09375,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 17.64237403869629,
|
|
"learning_rate": 7.452117519152541e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7008801.0,
|
|
"reward": 0.7093750238418579,
|
|
"reward_std": 0.189372718334198,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 148.0,
|
|
"completions/max_terminated_length": 148.0,
|
|
"completions/mean_length": 15.921875,
|
|
"completions/mean_terminated_length": 15.921875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.095,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 17.417652130126953,
|
|
"learning_rate": 7.379736965185368e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7089380.0,
|
|
"reward": 0.4281250238418579,
|
|
"reward_std": 0.1530819982290268,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 290.0,
|
|
"completions/max_terminated_length": 290.0,
|
|
"completions/mean_length": 16.765625,
|
|
"completions/mean_terminated_length": 16.765625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.09625,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 15.111842155456543,
|
|
"learning_rate": 7.306705814893439e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7194789.0,
|
|
"reward": 0.45781251788139343,
|
|
"reward_std": 0.04861358925700188,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.03125,
|
|
"completions/mean_terminated_length": 12.03125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.0975,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 30.111845016479492,
|
|
"learning_rate": 7.233044034264033e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7284127.0,
|
|
"reward": 0.2718750238418579,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.171875,
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 12.0625,
|
|
"completions/mean_terminated_length": 12.0625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.09875,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 35.65519714355469,
|
|
"learning_rate": 7.158771761692464e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7379515.0,
|
|
"reward": 0.8187500238418579,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.71875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 66.0,
|
|
"completions/max_terminated_length": 66.0,
|
|
"completions/mean_length": 12.90625,
|
|
"completions/mean_terminated_length": 12.90625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 20.51276206970215,
|
|
"learning_rate": 7.083909302476452e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7446461.0,
|
|
"reward": 0.6781250238418579,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.09375,
|
|
"completions/mean_terminated_length": 12.09375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.10125,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.008477123264847e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7561899.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 557.0,
|
|
"completions/max_terminated_length": 557.0,
|
|
"completions/mean_length": 20.515625,
|
|
"completions/mean_terminated_length": 20.515625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1025,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.932495846462261e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7644644.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.015625,
|
|
"completions/mean_terminated_length": 12.015625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.10375,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 6.890464782714844,
|
|
"learning_rate": 6.855986244591103e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7729413.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 12.140625,
|
|
"completions/mean_terminated_length": 12.140625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.105,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 19.9100341796875,
|
|
"learning_rate": 6.778969234612583e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7815534.0,
|
|
"reward": 0.7906250357627869,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/format_reward/mean": 0.875,
|
|
"rewards/format_reward/std": 0.3333333432674408,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.0625,
|
|
"completions/mean_terminated_length": 12.0625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.10625,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 18.460962295532227,
|
|
"learning_rate": 6.701465872208216e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7896298.0,
|
|
"reward": 0.7875000238418579,
|
|
"reward_std": 0.06681530922651291,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.6875,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.03125,
|
|
"completions/mean_terminated_length": 12.03125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1075,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 30.060848236083984,
|
|
"learning_rate": 6.623497346023417e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7974052.0,
|
|
"reward": 0.8031250238418579,
|
|
"reward_std": 0.1530819982290268,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 101.0,
|
|
"completions/max_terminated_length": 101.0,
|
|
"completions/mean_length": 13.453125,
|
|
"completions/mean_terminated_length": 13.453125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.10875,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 20.207021713256836,
|
|
"learning_rate": 6.545084971874736e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8049993.0,
|
|
"reward": 0.7562500238418579,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.65625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 54.0,
|
|
"completions/max_terminated_length": 54.0,
|
|
"completions/mean_length": 12.65625,
|
|
"completions/mean_terminated_length": 12.65625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.11,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 18.510467529296875,
|
|
"learning_rate": 6.466250186922324e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8119723.0,
|
|
"reward": 0.6625000238418579,
|
|
"reward_std": 0.06681530922651291,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.046875,
|
|
"completions/mean_terminated_length": 12.046875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.11125,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 26.590150833129883,
|
|
"learning_rate": 6.387014543809223e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8201878.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.13363061845302582,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 444.0,
|
|
"completions/max_terminated_length": 444.0,
|
|
"completions/mean_length": 22.1875,
|
|
"completions/mean_terminated_length": 22.1875,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1125,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 1.3266140222549438,
|
|
"learning_rate": 6.307399704769098e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8289162.0,
|
|
"reward": 0.8492187261581421,
|
|
"reward_std": 0.00220970856025815,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.03125,
|
|
"completions/mean_terminated_length": 12.03125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.11375,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 27.705215454101562,
|
|
"learning_rate": 6.227427435703995e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8378164.0,
|
|
"reward": 0.4828125238418579,
|
|
"reward_std": 0.12902677059173584,
|
|
"rewards/format_reward/mean": 0.921875,
|
|
"rewards/format_reward/std": 0.27048972249031067,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.0625,
|
|
"completions/mean_terminated_length": 12.0625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.115,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 25.083688735961914,
|
|
"learning_rate": 6.147119600233758e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8471760.0,
|
|
"reward": 0.9109375476837158,
|
|
"reward_std": 0.11871248483657837,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.8125,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 12.015625,
|
|
"completions/mean_terminated_length": 12.015625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.11625,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 22.57097053527832,
|
|
"learning_rate": 6.066498153718734e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8532233.0,
|
|
"reward": 0.6781250238418579,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 93
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1175,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 21.33436393737793,
|
|
"learning_rate": 5.985585137257401e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8625297.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.189372718334198,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 94
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.11875,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.90440267166055e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8693017.0,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.875,
|
|
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.12,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 17.82723617553711,
|
|
"learning_rate": 5.82297295140367e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8762857.0,
|
|
"reward": 0.5062500238418579,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 96
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.12125,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.498894691467285,
|
|
"learning_rate": 5.741318238559209e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8837825.0,
|
|
"reward": 0.7406250238418579,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 97
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 14.0,
|
|
"completions/max_terminated_length": 14.0,
|
|
"completions/mean_length": 12.0625,
|
|
"completions/mean_terminated_length": 12.0625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1225,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 7.857953071594238,
|
|
"learning_rate": 5.659460856710345e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8930893.0,
|
|
"reward": 0.5984375476837158,
|
|
"reward_std": 0.0044194171205163,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 98
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 12.140625,
|
|
"completions/mean_terminated_length": 12.140625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.12375,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 16.862316131591797,
|
|
"learning_rate": 5.577423184847931e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 9008550.0,
|
|
"reward": 0.5531250238418579,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 99
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 12.0,
|
|
"completions/mean_terminated_length": 12.0,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.125,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.495227651252315e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9088822.0,
|
|
"reward": 0.7250000238418579,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 100
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 200,
|
|
"num_input_tokens_seen": 9088822,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|