1385 lines
50 KiB
JSON
1385 lines
50 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.125,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 50,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.03125,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1464.0,
|
||
|
|
"completions/mean_length": 477.234375,
|
||
|
|
"completions/mean_terminated_length": 426.56451416015625,
|
||
|
|
"completions/min_length": 3.0,
|
||
|
|
"completions/min_terminated_length": 3.0,
|
||
|
|
"epoch": 0.0025,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 9.820798873901367,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 128463.0,
|
||
|
|
"reward": 0.30078125,
|
||
|
|
"reward_std": 0.2949070334434509,
|
||
|
|
"rewards/format_reward/mean": 0.3515625,
|
||
|
|
"rewards/format_reward/std": 0.3294980227947235,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1324.0,
|
||
|
|
"completions/max_terminated_length": 1324.0,
|
||
|
|
"completions/mean_length": 549.921875,
|
||
|
|
"completions/mean_terminated_length": 549.921875,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.005,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 5.992785453796387,
|
||
|
|
"learning_rate": 2e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 224890.0,
|
||
|
|
"reward": 0.2890625,
|
||
|
|
"reward_std": 0.39714252948760986,
|
||
|
|
"rewards/format_reward/mean": 0.390625,
|
||
|
|
"rewards/format_reward/std": 0.301698237657547,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1812.0,
|
||
|
|
"completions/mean_length": 556.71875,
|
||
|
|
"completions/mean_terminated_length": 533.0476684570312,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0075,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 6.250213146209717,
|
||
|
|
"learning_rate": 4e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 361720.0,
|
||
|
|
"reward": 0.1874999850988388,
|
||
|
|
"reward_std": 0.3287465274333954,
|
||
|
|
"rewards/format_reward/mean": 0.3125,
|
||
|
|
"rewards/format_reward/std": 0.2597312331199646,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.15625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 2010.0,
|
||
|
|
"completions/max_terminated_length": 2010.0,
|
||
|
|
"completions/mean_length": 388.546875,
|
||
|
|
"completions/mean_terminated_length": 388.546875,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.01,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 7.673406600952148,
|
||
|
|
"learning_rate": 6e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 469803.0,
|
||
|
|
"reward": 0.41718748211860657,
|
||
|
|
"reward_std": 0.42704811692237854,
|
||
|
|
"rewards/format_reward/mean": 0.421875,
|
||
|
|
"rewards/format_reward/std": 0.23935678601264954,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1047.0,
|
||
|
|
"completions/max_terminated_length": 1047.0,
|
||
|
|
"completions/mean_length": 299.203125,
|
||
|
|
"completions/mean_terminated_length": 299.203125,
|
||
|
|
"completions/min_length": 5.0,
|
||
|
|
"completions/min_terminated_length": 5.0,
|
||
|
|
"epoch": 0.0125,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 9.532109260559082,
|
||
|
|
"learning_rate": 8e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 581328.0,
|
||
|
|
"reward": 0.32656246423721313,
|
||
|
|
"reward_std": 0.25986582040786743,
|
||
|
|
"rewards/format_reward/mean": 0.296875,
|
||
|
|
"rewards/format_reward/std": 0.3177144229412079,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1405.0,
|
||
|
|
"completions/max_terminated_length": 1405.0,
|
||
|
|
"completions/mean_length": 549.515625,
|
||
|
|
"completions/mean_terminated_length": 549.515625,
|
||
|
|
"completions/min_length": 19.0,
|
||
|
|
"completions/min_terminated_length": 19.0,
|
||
|
|
"epoch": 0.015,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 6.551382064819336,
|
||
|
|
"learning_rate": 1e-06,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 695497.0,
|
||
|
|
"reward": 0.25078123807907104,
|
||
|
|
"reward_std": 0.3316580057144165,
|
||
|
|
"rewards/format_reward/mean": 0.4765625,
|
||
|
|
"rewards/format_reward/std": 0.28770697116851807,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.03125,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1751.0,
|
||
|
|
"completions/mean_length": 496.078125,
|
||
|
|
"completions/mean_terminated_length": 446.01611328125,
|
||
|
|
"completions/min_length": 2.0,
|
||
|
|
"completions/min_terminated_length": 2.0,
|
||
|
|
"epoch": 0.0175,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 14.735747337341309,
|
||
|
|
"learning_rate": 9.99726628670463e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 819054.0,
|
||
|
|
"reward": 0.41015625,
|
||
|
|
"reward_std": 0.42701074481010437,
|
||
|
|
"rewards/format_reward/mean": 0.3515625,
|
||
|
|
"rewards/format_reward/std": 0.24688033759593964,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1369.0,
|
||
|
|
"completions/max_terminated_length": 1369.0,
|
||
|
|
"completions/mean_length": 461.125,
|
||
|
|
"completions/mean_terminated_length": 461.125,
|
||
|
|
"completions/min_length": 8.0,
|
||
|
|
"completions/min_terminated_length": 8.0,
|
||
|
|
"epoch": 0.02,
|
||
|
|
"frac_reward_zero_std": 0.125,
|
||
|
|
"grad_norm": 2.3624932765960693,
|
||
|
|
"learning_rate": 9.989068136093872e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 918694.0,
|
||
|
|
"reward": 0.42656248807907104,
|
||
|
|
"reward_std": 0.3252020478248596,
|
||
|
|
"rewards/format_reward/mean": 0.359375,
|
||
|
|
"rewards/format_reward/std": 0.2741328477859497,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.046875,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1707.0,
|
||
|
|
"completions/mean_length": 575.140625,
|
||
|
|
"completions/mean_terminated_length": 502.70489501953125,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0225,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 5.942022800445557,
|
||
|
|
"learning_rate": 9.975414512725056e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 1059111.0,
|
||
|
|
"reward": 0.32343748211860657,
|
||
|
|
"reward_std": 0.38642236590385437,
|
||
|
|
"rewards/format_reward/mean": 0.421875,
|
||
|
|
"rewards/format_reward/std": 0.18298126757144928,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1615.0,
|
||
|
|
"completions/mean_length": 552.0625,
|
||
|
|
"completions/mean_terminated_length": 528.3175048828125,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.025,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 10.24199390411377,
|
||
|
|
"learning_rate": 9.956320346634875e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 1199859.0,
|
||
|
|
"reward": 0.29140621423721313,
|
||
|
|
"reward_std": 0.28808674216270447,
|
||
|
|
"rewards/format_reward/mean": 0.4140625,
|
||
|
|
"rewards/format_reward/std": 0.209963858127594,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 971.0,
|
||
|
|
"completions/mean_length": 323.359375,
|
||
|
|
"completions/mean_terminated_length": 295.984130859375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0275,
|
||
|
|
"frac_reward_zero_std": 0.125,
|
||
|
|
"grad_norm": 11.983031272888184,
|
||
|
|
"learning_rate": 9.931806517013612e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 1308242.0,
|
||
|
|
"reward": 0.24374999105930328,
|
||
|
|
"reward_std": 0.26579102873802185,
|
||
|
|
"rewards/format_reward/mean": 0.40625,
|
||
|
|
"rewards/format_reward/std": 0.19669894874095917,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1475.0,
|
||
|
|
"completions/max_terminated_length": 1475.0,
|
||
|
|
"completions/mean_length": 494.640625,
|
||
|
|
"completions/mean_terminated_length": 494.640625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.03,
|
||
|
|
"frac_reward_zero_std": 0.125,
|
||
|
|
"grad_norm": 13.534098625183105,
|
||
|
|
"learning_rate": 9.901899829374047e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 1427139.0,
|
||
|
|
"reward": 0.453125,
|
||
|
|
"reward_std": 0.37656593322753906,
|
||
|
|
"rewards/format_reward/mean": 0.46875,
|
||
|
|
"rewards/format_reward/std": 0.21593283116817474,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1420.0,
|
||
|
|
"completions/max_terminated_length": 1420.0,
|
||
|
|
"completions/mean_length": 591.890625,
|
||
|
|
"completions/mean_terminated_length": 591.890625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0325,
|
||
|
|
"frac_reward_zero_std": 0.125,
|
||
|
|
"grad_norm": 3.6356875896453857,
|
||
|
|
"learning_rate": 9.866632986240029e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 1555604.0,
|
||
|
|
"reward": 0.24765624105930328,
|
||
|
|
"reward_std": 0.3406350612640381,
|
||
|
|
"rewards/format_reward/mean": 0.4453125,
|
||
|
|
"rewards/format_reward/std": 0.2538151443004608,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1100.0,
|
||
|
|
"completions/max_terminated_length": 1100.0,
|
||
|
|
"completions/mean_length": 327.9375,
|
||
|
|
"completions/mean_terminated_length": 327.9375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.035,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 19.802650451660156,
|
||
|
|
"learning_rate": 9.826044551386742e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 1663176.0,
|
||
|
|
"reward": 0.30781248211860657,
|
||
|
|
"reward_std": 0.3470980226993561,
|
||
|
|
"rewards/format_reward/mean": 0.421875,
|
||
|
|
"rewards/format_reward/std": 0.20351573824882507,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1263.0,
|
||
|
|
"completions/mean_length": 416.265625,
|
||
|
|
"completions/mean_terminated_length": 390.3651123046875,
|
||
|
|
"completions/min_length": 3.0,
|
||
|
|
"completions/min_terminated_length": 3.0,
|
||
|
|
"epoch": 0.0375,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 10.966890335083008,
|
||
|
|
"learning_rate": 9.780178907671788e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 1757385.0,
|
||
|
|
"reward": 0.3671875,
|
||
|
|
"reward_std": 0.37327155470848083,
|
||
|
|
"rewards/format_reward/mean": 0.546875,
|
||
|
|
"rewards/format_reward/std": 0.3299681544303894,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1576.0,
|
||
|
|
"completions/mean_length": 447.671875,
|
||
|
|
"completions/mean_terminated_length": 422.2698669433594,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.04,
|
||
|
|
"frac_reward_zero_std": 0.125,
|
||
|
|
"grad_norm": 10.119935989379883,
|
||
|
|
"learning_rate": 9.729086208503173e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 1892308.0,
|
||
|
|
"reward": 0.5914062261581421,
|
||
|
|
"reward_std": 0.31337296962738037,
|
||
|
|
"rewards/format_reward/mean": 0.4453125,
|
||
|
|
"rewards/format_reward/std": 0.15728822350502014,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1743.0,
|
||
|
|
"completions/max_terminated_length": 1743.0,
|
||
|
|
"completions/mean_length": 421.59375,
|
||
|
|
"completions/mean_terminated_length": 421.59375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0425,
|
||
|
|
"frac_reward_zero_std": 0.125,
|
||
|
|
"grad_norm": 10.146256446838379,
|
||
|
|
"learning_rate": 9.672822322997304e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 2011562.0,
|
||
|
|
"reward": 0.29140621423721313,
|
||
|
|
"reward_std": 0.3035487234592438,
|
||
|
|
"rewards/format_reward/mean": 0.4140625,
|
||
|
|
"rewards/format_reward/std": 0.209963858127594,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1499.0,
|
||
|
|
"completions/max_terminated_length": 1499.0,
|
||
|
|
"completions/mean_length": 368.15625,
|
||
|
|
"completions/mean_terminated_length": 368.15625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.045,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 28.832923889160156,
|
||
|
|
"learning_rate": 9.611448774886923e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 2107684.0,
|
||
|
|
"reward": 0.31328123807907104,
|
||
|
|
"reward_std": 0.32075032591819763,
|
||
|
|
"rewards/format_reward/mean": 0.4765625,
|
||
|
|
"rewards/format_reward/std": 0.22589658200740814,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1508.0,
|
||
|
|
"completions/max_terminated_length": 1508.0,
|
||
|
|
"completions/mean_length": 311.4375,
|
||
|
|
"completions/mean_terminated_length": 311.4375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0475,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"grad_norm": 17.548686981201172,
|
||
|
|
"learning_rate": 9.545032675245813e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 2220144.0,
|
||
|
|
"reward": 0.5015624761581421,
|
||
|
|
"reward_std": 0.41764065623283386,
|
||
|
|
"rewards/format_reward/mean": 0.484375,
|
||
|
|
"rewards/format_reward/std": 0.08768405020236969,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1022.0,
|
||
|
|
"completions/mean_length": 172.484375,
|
||
|
|
"completions/mean_terminated_length": 142.71429443359375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.05,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"grad_norm": 16.76396942138672,
|
||
|
|
"learning_rate": 9.473646649103817e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 2318791.0,
|
||
|
|
"reward": 0.5539062023162842,
|
||
|
|
"reward_std": 0.17947588860988617,
|
||
|
|
"rewards/format_reward/mean": 0.5390625,
|
||
|
|
"rewards/format_reward/std": 0.16194961965084076,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1462.0,
|
||
|
|
"completions/max_terminated_length": 1462.0,
|
||
|
|
"completions/mean_length": 254.03125,
|
||
|
|
"completions/mean_terminated_length": 254.03125,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0525,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 12.80144214630127,
|
||
|
|
"learning_rate": 9.397368756032444e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 2429865.0,
|
||
|
|
"reward": 0.25312498211860657,
|
||
|
|
"reward_std": 0.33632034063339233,
|
||
|
|
"rewards/format_reward/mean": 0.5,
|
||
|
|
"rewards/format_reward/std": 0.1259881556034088,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 954.0,
|
||
|
|
"completions/max_terminated_length": 954.0,
|
||
|
|
"completions/mean_length": 104.65625,
|
||
|
|
"completions/mean_terminated_length": 104.65625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.055,
|
||
|
|
"frac_reward_zero_std": 0.625,
|
||
|
|
"grad_norm": 10.90204906463623,
|
||
|
|
"learning_rate": 9.316282404787869e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 2523307.0,
|
||
|
|
"reward": 0.4093749523162842,
|
||
|
|
"reward_std": 0.17358146607875824,
|
||
|
|
"rewards/format_reward/mean": 0.5,
|
||
|
|
"rewards/format_reward/std": 0.0,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1322.0,
|
||
|
|
"completions/max_terminated_length": 1322.0,
|
||
|
|
"completions/mean_length": 377.9375,
|
||
|
|
"completions/mean_terminated_length": 377.9375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0575,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 11.154988288879395,
|
||
|
|
"learning_rate": 9.230476262104676e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 2626679.0,
|
||
|
|
"reward": 0.22343748807907104,
|
||
|
|
"reward_std": 0.28182199597358704,
|
||
|
|
"rewards/format_reward/mean": 0.515625,
|
||
|
|
"rewards/format_reward/std": 0.1534975916147232,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.171875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 398.0,
|
||
|
|
"completions/max_terminated_length": 398.0,
|
||
|
|
"completions/mean_length": 12.125,
|
||
|
|
"completions/mean_terminated_length": 12.125,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.06,
|
||
|
|
"frac_reward_zero_std": 0.375,
|
||
|
|
"grad_norm": 11.924612045288086,
|
||
|
|
"learning_rate": 9.1400441557401e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 2733791.0,
|
||
|
|
"reward": 0.4718749523162842,
|
||
|
|
"reward_std": 0.24831001460552216,
|
||
|
|
"rewards/format_reward/mean": 0.5,
|
||
|
|
"rewards/format_reward/std": 0.0,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 979.0,
|
||
|
|
"completions/max_terminated_length": 979.0,
|
||
|
|
"completions/mean_length": 173.625,
|
||
|
|
"completions/mean_terminated_length": 173.625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0625,
|
||
|
|
"frac_reward_zero_std": 0.375,
|
||
|
|
"grad_norm": 17.104806900024414,
|
||
|
|
"learning_rate": 9.045084971874737e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 2827215.0,
|
||
|
|
"reward": 0.51953125,
|
||
|
|
"reward_std": 0.22175219655036926,
|
||
|
|
"rewards/format_reward/mean": 0.5078125,
|
||
|
|
"rewards/format_reward/std": 0.1406387835741043,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1075.0,
|
||
|
|
"completions/max_terminated_length": 1075.0,
|
||
|
|
"completions/mean_length": 161.796875,
|
||
|
|
"completions/mean_terminated_length": 161.796875,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.065,
|
||
|
|
"frac_reward_zero_std": 0.375,
|
||
|
|
"grad_norm": 24.93585968017578,
|
||
|
|
"learning_rate": 8.945702546981968e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 2902970.0,
|
||
|
|
"reward": 0.40859371423721313,
|
||
|
|
"reward_std": 0.22210699319839478,
|
||
|
|
"rewards/format_reward/mean": 0.4921875,
|
||
|
|
"rewards/format_reward/std": 0.0625,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 928.0,
|
||
|
|
"completions/max_terminated_length": 928.0,
|
||
|
|
"completions/mean_length": 235.765625,
|
||
|
|
"completions/mean_terminated_length": 235.765625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0675,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 16.02617645263672,
|
||
|
|
"learning_rate": 8.842005554284295e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 3005379.0,
|
||
|
|
"reward": 0.29999998211860657,
|
||
|
|
"reward_std": 0.2879316806793213,
|
||
|
|
"rewards/format_reward/mean": 0.5,
|
||
|
|
"rewards/format_reward/std": 0.08908708393573761,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 935.0,
|
||
|
|
"completions/max_terminated_length": 935.0,
|
||
|
|
"completions/mean_length": 141.0,
|
||
|
|
"completions/mean_terminated_length": 141.0,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.07,
|
||
|
|
"frac_reward_zero_std": 0.125,
|
||
|
|
"grad_norm": 29.071575164794922,
|
||
|
|
"learning_rate": 8.734107384920769e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 3102595.0,
|
||
|
|
"reward": 0.4562499523162842,
|
||
|
|
"reward_std": 0.3846532702445984,
|
||
|
|
"rewards/format_reward/mean": 0.5,
|
||
|
|
"rewards/format_reward/std": 0.08908708393573761,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 961.0,
|
||
|
|
"completions/max_terminated_length": 961.0,
|
||
|
|
"completions/mean_length": 109.046875,
|
||
|
|
"completions/mean_terminated_length": 109.046875,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0725,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"grad_norm": 14.398436546325684,
|
||
|
|
"learning_rate": 8.622126023955445e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 3190334.0,
|
||
|
|
"reward": 0.6898437142372131,
|
||
|
|
"reward_std": 0.19096830487251282,
|
||
|
|
"rewards/format_reward/mean": 0.4921875,
|
||
|
|
"rewards/format_reward/std": 0.0625,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 2034.0,
|
||
|
|
"completions/max_terminated_length": 2034.0,
|
||
|
|
"completions/mean_length": 313.390625,
|
||
|
|
"completions/mean_terminated_length": 313.390625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.075,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 23.27433967590332,
|
||
|
|
"learning_rate": 8.506183921362442e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 3301191.0,
|
||
|
|
"reward": 0.6148437261581421,
|
||
|
|
"reward_std": 0.3767889142036438,
|
||
|
|
"rewards/format_reward/mean": 0.5234375,
|
||
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1409.0,
|
||
|
|
"completions/max_terminated_length": 1409.0,
|
||
|
|
"completions/mean_length": 302.03125,
|
||
|
|
"completions/mean_terminated_length": 302.03125,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0775,
|
||
|
|
"frac_reward_zero_std": 0.375,
|
||
|
|
"grad_norm": 9.451353073120117,
|
||
|
|
"learning_rate": 8.386407858128706e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 3431513.0,
|
||
|
|
"reward": 0.3273437023162842,
|
||
|
|
"reward_std": 0.24055621027946472,
|
||
|
|
"rewards/format_reward/mean": 0.4609375,
|
||
|
|
"rewards/format_reward/std": 0.16194961965084076,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1716.0,
|
||
|
|
"completions/max_terminated_length": 1716.0,
|
||
|
|
"completions/mean_length": 140.75,
|
||
|
|
"completions/mean_terminated_length": 140.75,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.08,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"grad_norm": 19.545513153076172,
|
||
|
|
"learning_rate": 8.262928807620843e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 3516601.0,
|
||
|
|
"reward": 0.42890626192092896,
|
||
|
|
"reward_std": 0.0956839770078659,
|
||
|
|
"rewards/format_reward/mean": 0.5390625,
|
||
|
|
"rewards/format_reward/std": 0.2236899733543396,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 333.0,
|
||
|
|
"completions/max_terminated_length": 333.0,
|
||
|
|
"completions/mean_length": 26.1875,
|
||
|
|
"completions/mean_terminated_length": 26.1875,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0825,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"grad_norm": 11.879921913146973,
|
||
|
|
"learning_rate": 8.135881792367685e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 3610013.0,
|
||
|
|
"reward": 0.7226561903953552,
|
||
|
|
"reward_std": 0.19158241152763367,
|
||
|
|
"rewards/format_reward/mean": 0.5078125,
|
||
|
|
"rewards/format_reward/std": 0.0625,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1395.0,
|
||
|
|
"completions/max_terminated_length": 1395.0,
|
||
|
|
"completions/mean_length": 130.375,
|
||
|
|
"completions/mean_terminated_length": 130.375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.085,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 22.375093460083008,
|
||
|
|
"learning_rate": 8.005405736415125e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 3706909.0,
|
||
|
|
"reward": 0.4156249761581421,
|
||
|
|
"reward_std": 0.34389790892601013,
|
||
|
|
"rewards/format_reward/mean": 0.5625,
|
||
|
|
"rewards/format_reward/std": 0.18898223340511322,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.03125,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1169.0,
|
||
|
|
"completions/mean_length": 228.734375,
|
||
|
|
"completions/mean_terminated_length": 170.0483856201172,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0875,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 17.84543228149414,
|
||
|
|
"learning_rate": 7.871643313414718e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 3815884.0,
|
||
|
|
"reward": 0.7546874284744263,
|
||
|
|
"reward_std": 0.28800931572914124,
|
||
|
|
"rewards/format_reward/mean": 0.515625,
|
||
|
|
"rewards/format_reward/std": 0.1534975916147232,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 311.0,
|
||
|
|
"completions/max_terminated_length": 311.0,
|
||
|
|
"completions/mean_length": 27.609375,
|
||
|
|
"completions/mean_terminated_length": 27.609375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.09,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 31.98973274230957,
|
||
|
|
"learning_rate": 7.734740790612136e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 3941163.0,
|
||
|
|
"reward": 0.4898437261581421,
|
||
|
|
"reward_std": 0.2993735373020172,
|
||
|
|
"rewards/format_reward/mean": 0.5234375,
|
||
|
|
"rewards/format_reward/std": 0.10652101784944534,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1835.0,
|
||
|
|
"completions/max_terminated_length": 1835.0,
|
||
|
|
"completions/mean_length": 61.625,
|
||
|
|
"completions/mean_terminated_length": 61.625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0925,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"grad_norm": 12.227835655212402,
|
||
|
|
"learning_rate": 7.594847868906076e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 4027779.0,
|
||
|
|
"reward": 0.528124988079071,
|
||
|
|
"reward_std": 0.1930253505706787,
|
||
|
|
"rewards/format_reward/mean": 0.4375,
|
||
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1583.0,
|
||
|
|
"completions/max_terminated_length": 1583.0,
|
||
|
|
"completions/mean_length": 239.546875,
|
||
|
|
"completions/mean_terminated_length": 239.546875,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.095,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 20.220304489135742,
|
||
|
|
"learning_rate": 7.452117519152541e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 4120606.0,
|
||
|
|
"reward": 0.39531248807907104,
|
||
|
|
"reward_std": 0.3611350357532501,
|
||
|
|
"rewards/format_reward/mean": 0.671875,
|
||
|
|
"rewards/format_reward/std": 0.2847827076911926,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 926.0,
|
||
|
|
"completions/max_terminated_length": 926.0,
|
||
|
|
"completions/mean_length": 72.34375,
|
||
|
|
"completions/mean_terminated_length": 72.34375,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.0975,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"grad_norm": 21.585006713867188,
|
||
|
|
"learning_rate": 7.306705814893439e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 4210956.0,
|
||
|
|
"reward": 0.37812501192092896,
|
||
|
|
"reward_std": 0.23370197415351868,
|
||
|
|
"rewards/format_reward/mean": 0.65625,
|
||
|
|
"rewards/format_reward/std": 0.233588308095932,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 1230.0,
|
||
|
|
"completions/max_terminated_length": 1230.0,
|
||
|
|
"completions/mean_length": 107.265625,
|
||
|
|
"completions/mean_terminated_length": 107.265625,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.1,
|
||
|
|
"frac_reward_zero_std": 0.375,
|
||
|
|
"grad_norm": 19.986061096191406,
|
||
|
|
"learning_rate": 7.158771761692464e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 4302349.0,
|
||
|
|
"reward": 0.28593748807907104,
|
||
|
|
"reward_std": 0.20377102494239807,
|
||
|
|
"rewards/format_reward/mean": 0.671875,
|
||
|
|
"rewards/format_reward/std": 0.23935678601264954,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 784.0,
|
||
|
|
"completions/max_terminated_length": 784.0,
|
||
|
|
"completions/mean_length": 170.578125,
|
||
|
|
"completions/mean_terminated_length": 170.578125,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.1025,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 23.84389305114746,
|
||
|
|
"learning_rate": 7.008477123264847e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 4402810.0,
|
||
|
|
"reward": 0.6890624761581421,
|
||
|
|
"reward_std": 0.3438800573348999,
|
||
|
|
"rewards/format_reward/mean": 0.640625,
|
||
|
|
"rewards/format_reward/std": 0.24346621334552765,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.046875,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1099.0,
|
||
|
|
"completions/mean_length": 197.78125,
|
||
|
|
"completions/mean_terminated_length": 106.78688049316406,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.105,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 30.521644592285156,
|
||
|
|
"learning_rate": 6.855986244591103e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 4512004.0,
|
||
|
|
"reward": 0.5382812023162842,
|
||
|
|
"reward_std": 0.25127214193344116,
|
||
|
|
"rewards/format_reward/mean": 0.6953125,
|
||
|
|
"rewards/format_reward/std": 0.31644338369369507,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 1161.0,
|
||
|
|
"completions/mean_length": 134.6875,
|
||
|
|
"completions/mean_terminated_length": 104.31746673583984,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.1075,
|
||
|
|
"frac_reward_zero_std": 0.125,
|
||
|
|
"grad_norm": 28.4834041595459,
|
||
|
|
"learning_rate": 6.701465872208216e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 4623384.0,
|
||
|
|
"reward": 0.2992187440395355,
|
||
|
|
"reward_std": 0.23480820655822754,
|
||
|
|
"rewards/format_reward/mean": 0.8046875,
|
||
|
|
"rewards/format_reward/std": 0.29028159379959106,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 715.0,
|
||
|
|
"completions/max_terminated_length": 715.0,
|
||
|
|
"completions/mean_length": 76.203125,
|
||
|
|
"completions/mean_terminated_length": 76.203125,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.11,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 14.868675231933594,
|
||
|
|
"learning_rate": 6.545084971874736e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 4706005.0,
|
||
|
|
"reward": 0.32734376192092896,
|
||
|
|
"reward_std": 0.24423527717590332,
|
||
|
|
"rewards/format_reward/mean": 0.9296875,
|
||
|
|
"rewards/format_reward/std": 0.1751912236213684,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.046875,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 668.0,
|
||
|
|
"completions/mean_length": 154.5625,
|
||
|
|
"completions/mean_terminated_length": 61.44261932373047,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.1125,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 15.19470500946045,
|
||
|
|
"learning_rate": 6.387014543809223e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 4798985.0,
|
||
|
|
"reward": 0.5421874523162842,
|
||
|
|
"reward_std": 0.3319449722766876,
|
||
|
|
"rewards/format_reward/mean": 0.734375,
|
||
|
|
"rewards/format_reward/std": 0.30820462107658386,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 543.0,
|
||
|
|
"completions/mean_length": 72.453125,
|
||
|
|
"completions/mean_terminated_length": 41.09524154663086,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.115,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"grad_norm": 15.99255084991455,
|
||
|
|
"learning_rate": 6.227427435703995e-07,
|
||
|
|
"loss": -0.0,
|
||
|
|
"num_tokens": 4918510.0,
|
||
|
|
"reward": 0.31406253576278687,
|
||
|
|
"reward_std": 0.0530330091714859,
|
||
|
|
"rewards/format_reward/mean": 0.796875,
|
||
|
|
"rewards/format_reward/std": 0.3642643094062805,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 608.0,
|
||
|
|
"completions/mean_length": 79.3125,
|
||
|
|
"completions/mean_terminated_length": 48.06349563598633,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.1175,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 18.731386184692383,
|
||
|
|
"learning_rate": 6.066498153718734e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 5018826.0,
|
||
|
|
"reward": 0.6578124761581421,
|
||
|
|
"reward_std": 0.29044055938720703,
|
||
|
|
"rewards/format_reward/mean": 0.796875,
|
||
|
|
"rewards/format_reward/std": 0.3177144229412079,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 855.0,
|
||
|
|
"completions/max_terminated_length": 855.0,
|
||
|
|
"completions/mean_length": 66.71875,
|
||
|
|
"completions/mean_terminated_length": 66.71875,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.12,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 18.806800842285156,
|
||
|
|
"learning_rate": 5.90440267166055e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 5152376.0,
|
||
|
|
"reward": 0.3304687738418579,
|
||
|
|
"reward_std": 0.30621567368507385,
|
||
|
|
"rewards/format_reward/mean": 0.8046875,
|
||
|
|
"rewards/format_reward/std": 0.37392371892929077,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.015625,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 774.0,
|
||
|
|
"completions/mean_length": 102.953125,
|
||
|
|
"completions/mean_terminated_length": 72.0793685913086,
|
||
|
|
"completions/min_length": 6.0,
|
||
|
|
"completions/min_terminated_length": 6.0,
|
||
|
|
"epoch": 0.1225,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 34.07569885253906,
|
||
|
|
"learning_rate": 5.741318238559209e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 5259269.0,
|
||
|
|
"reward": 0.45234376192092896,
|
||
|
|
"reward_std": 0.2820115089416504,
|
||
|
|
"rewards/format_reward/mean": 0.7734375,
|
||
|
|
"rewards/format_reward/std": 0.2807259261608124,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.03125,
|
||
|
|
"completions/max_length": 2048.0,
|
||
|
|
"completions/max_terminated_length": 414.0,
|
||
|
|
"completions/mean_length": 127.9375,
|
||
|
|
"completions/mean_terminated_length": 66.0,
|
||
|
|
"completions/min_length": 11.0,
|
||
|
|
"completions/min_terminated_length": 11.0,
|
||
|
|
"epoch": 0.125,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"grad_norm": 27.46225929260254,
|
||
|
|
"learning_rate": 5.577423184847931e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 5354257.0,
|
||
|
|
"reward": 0.592968761920929,
|
||
|
|
"reward_std": 0.22307650744915009,
|
||
|
|
"rewards/format_reward/mean": 0.9296875,
|
||
|
|
"rewards/format_reward/std": 0.23345555365085602,
|
||
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
||
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
||
|
|
"step": 50
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 100,
|
||
|
|
"num_input_tokens_seen": 5354257,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 50,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|