2735 lines
99 KiB
JSON
2735 lines
99 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.25,
|
|
"eval_steps": 500,
|
|
"global_step": 100,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1464.0,
|
|
"completions/mean_length": 477.234375,
|
|
"completions/mean_terminated_length": 426.56451416015625,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.0025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.820798873901367,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0,
|
|
"num_tokens": 128463.0,
|
|
"reward": 0.30078125,
|
|
"reward_std": 0.2949070334434509,
|
|
"rewards/format_reward/mean": 0.3515625,
|
|
"rewards/format_reward/std": 0.3294980227947235,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1324.0,
|
|
"completions/max_terminated_length": 1324.0,
|
|
"completions/mean_length": 549.921875,
|
|
"completions/mean_terminated_length": 549.921875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.005,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.992785453796387,
|
|
"learning_rate": 2e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 224890.0,
|
|
"reward": 0.2890625,
|
|
"reward_std": 0.39714252948760986,
|
|
"rewards/format_reward/mean": 0.390625,
|
|
"rewards/format_reward/std": 0.301698237657547,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1812.0,
|
|
"completions/mean_length": 556.71875,
|
|
"completions/mean_terminated_length": 533.0476684570312,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.250213146209717,
|
|
"learning_rate": 4e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 361720.0,
|
|
"reward": 0.1874999850988388,
|
|
"reward_std": 0.3287465274333954,
|
|
"rewards/format_reward/mean": 0.3125,
|
|
"rewards/format_reward/std": 0.2597312331199646,
|
|
"rewards/mcq_exact_match_reward/mean": 0.15625,
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2010.0,
|
|
"completions/max_terminated_length": 2010.0,
|
|
"completions/mean_length": 388.546875,
|
|
"completions/mean_terminated_length": 388.546875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.673406600952148,
|
|
"learning_rate": 6e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 469803.0,
|
|
"reward": 0.41718748211860657,
|
|
"reward_std": 0.42704811692237854,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.23935678601264954,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1047.0,
|
|
"completions/max_terminated_length": 1047.0,
|
|
"completions/mean_length": 299.203125,
|
|
"completions/mean_terminated_length": 299.203125,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.0125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.532109260559082,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 581328.0,
|
|
"reward": 0.32656246423721313,
|
|
"reward_std": 0.25986582040786743,
|
|
"rewards/format_reward/mean": 0.296875,
|
|
"rewards/format_reward/std": 0.3177144229412079,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1405.0,
|
|
"completions/max_terminated_length": 1405.0,
|
|
"completions/mean_length": 549.515625,
|
|
"completions/mean_terminated_length": 549.515625,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.551382064819336,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 695497.0,
|
|
"reward": 0.25078123807907104,
|
|
"reward_std": 0.3316580057144165,
|
|
"rewards/format_reward/mean": 0.4765625,
|
|
"rewards/format_reward/std": 0.28770697116851807,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1751.0,
|
|
"completions/mean_length": 496.078125,
|
|
"completions/mean_terminated_length": 446.01611328125,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0175,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.735747337341309,
|
|
"learning_rate": 9.99726628670463e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 819054.0,
|
|
"reward": 0.41015625,
|
|
"reward_std": 0.42701074481010437,
|
|
"rewards/format_reward/mean": 0.3515625,
|
|
"rewards/format_reward/std": 0.24688033759593964,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1369.0,
|
|
"completions/max_terminated_length": 1369.0,
|
|
"completions/mean_length": 461.125,
|
|
"completions/mean_terminated_length": 461.125,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 2.3624932765960693,
|
|
"learning_rate": 9.989068136093872e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 918694.0,
|
|
"reward": 0.42656248807907104,
|
|
"reward_std": 0.3252020478248596,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.2741328477859497,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1707.0,
|
|
"completions/mean_length": 575.140625,
|
|
"completions/mean_terminated_length": 502.70489501953125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0225,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.942022800445557,
|
|
"learning_rate": 9.975414512725056e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1059111.0,
|
|
"reward": 0.32343748211860657,
|
|
"reward_std": 0.38642236590385437,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.18298126757144928,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1615.0,
|
|
"completions/mean_length": 552.0625,
|
|
"completions/mean_terminated_length": 528.3175048828125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.24199390411377,
|
|
"learning_rate": 9.956320346634875e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1199859.0,
|
|
"reward": 0.29140621423721313,
|
|
"reward_std": 0.28808674216270447,
|
|
"rewards/format_reward/mean": 0.4140625,
|
|
"rewards/format_reward/std": 0.209963858127594,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 971.0,
|
|
"completions/mean_length": 323.359375,
|
|
"completions/mean_terminated_length": 295.984130859375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0275,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 11.983031272888184,
|
|
"learning_rate": 9.931806517013612e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1308242.0,
|
|
"reward": 0.24374999105930328,
|
|
"reward_std": 0.26579102873802185,
|
|
"rewards/format_reward/mean": 0.40625,
|
|
"rewards/format_reward/std": 0.19669894874095917,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1475.0,
|
|
"completions/max_terminated_length": 1475.0,
|
|
"completions/mean_length": 494.640625,
|
|
"completions/mean_terminated_length": 494.640625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 13.534098625183105,
|
|
"learning_rate": 9.901899829374047e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1427139.0,
|
|
"reward": 0.453125,
|
|
"reward_std": 0.37656593322753906,
|
|
"rewards/format_reward/mean": 0.46875,
|
|
"rewards/format_reward/std": 0.21593283116817474,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1420.0,
|
|
"completions/max_terminated_length": 1420.0,
|
|
"completions/mean_length": 591.890625,
|
|
"completions/mean_terminated_length": 591.890625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0325,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 3.6356875896453857,
|
|
"learning_rate": 9.866632986240029e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1555604.0,
|
|
"reward": 0.24765624105930328,
|
|
"reward_std": 0.3406350612640381,
|
|
"rewards/format_reward/mean": 0.4453125,
|
|
"rewards/format_reward/std": 0.2538151443004608,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1100.0,
|
|
"completions/max_terminated_length": 1100.0,
|
|
"completions/mean_length": 327.9375,
|
|
"completions/mean_terminated_length": 327.9375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.035,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 19.802650451660156,
|
|
"learning_rate": 9.826044551386742e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1663176.0,
|
|
"reward": 0.30781248211860657,
|
|
"reward_std": 0.3470980226993561,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.20351573824882507,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1263.0,
|
|
"completions/mean_length": 416.265625,
|
|
"completions/mean_terminated_length": 390.3651123046875,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.0375,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.966890335083008,
|
|
"learning_rate": 9.780178907671788e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1757385.0,
|
|
"reward": 0.3671875,
|
|
"reward_std": 0.37327155470848083,
|
|
"rewards/format_reward/mean": 0.546875,
|
|
"rewards/format_reward/std": 0.3299681544303894,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1576.0,
|
|
"completions/mean_length": 447.671875,
|
|
"completions/mean_terminated_length": 422.2698669433594,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 10.119935989379883,
|
|
"learning_rate": 9.729086208503173e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1892308.0,
|
|
"reward": 0.5914062261581421,
|
|
"reward_std": 0.31337296962738037,
|
|
"rewards/format_reward/mean": 0.4453125,
|
|
"rewards/format_reward/std": 0.15728822350502014,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1743.0,
|
|
"completions/max_terminated_length": 1743.0,
|
|
"completions/mean_length": 421.59375,
|
|
"completions/mean_terminated_length": 421.59375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0425,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 10.146256446838379,
|
|
"learning_rate": 9.672822322997304e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2011562.0,
|
|
"reward": 0.29140621423721313,
|
|
"reward_std": 0.3035487234592438,
|
|
"rewards/format_reward/mean": 0.4140625,
|
|
"rewards/format_reward/std": 0.209963858127594,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1499.0,
|
|
"completions/max_terminated_length": 1499.0,
|
|
"completions/mean_length": 368.15625,
|
|
"completions/mean_terminated_length": 368.15625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.045,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 28.832923889160156,
|
|
"learning_rate": 9.611448774886923e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2107684.0,
|
|
"reward": 0.31328123807907104,
|
|
"reward_std": 0.32075032591819763,
|
|
"rewards/format_reward/mean": 0.4765625,
|
|
"rewards/format_reward/std": 0.22589658200740814,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1508.0,
|
|
"completions/max_terminated_length": 1508.0,
|
|
"completions/mean_length": 311.4375,
|
|
"completions/mean_terminated_length": 311.4375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0475,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.548686981201172,
|
|
"learning_rate": 9.545032675245813e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2220144.0,
|
|
"reward": 0.5015624761581421,
|
|
"reward_std": 0.41764065623283386,
|
|
"rewards/format_reward/mean": 0.484375,
|
|
"rewards/format_reward/std": 0.08768405020236969,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1022.0,
|
|
"completions/mean_length": 172.484375,
|
|
"completions/mean_terminated_length": 142.71429443359375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 16.76396942138672,
|
|
"learning_rate": 9.473646649103817e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2318791.0,
|
|
"reward": 0.5539062023162842,
|
|
"reward_std": 0.17947588860988617,
|
|
"rewards/format_reward/mean": 0.5390625,
|
|
"rewards/format_reward/std": 0.16194961965084076,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1462.0,
|
|
"completions/max_terminated_length": 1462.0,
|
|
"completions/mean_length": 254.03125,
|
|
"completions/mean_terminated_length": 254.03125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0525,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 12.80144214630127,
|
|
"learning_rate": 9.397368756032444e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2429865.0,
|
|
"reward": 0.25312498211860657,
|
|
"reward_std": 0.33632034063339233,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.1259881556034088,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 954.0,
|
|
"completions/max_terminated_length": 954.0,
|
|
"completions/mean_length": 104.65625,
|
|
"completions/mean_terminated_length": 104.65625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.055,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 10.90204906463623,
|
|
"learning_rate": 9.316282404787869e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2523307.0,
|
|
"reward": 0.4093749523162842,
|
|
"reward_std": 0.17358146607875824,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1322.0,
|
|
"completions/max_terminated_length": 1322.0,
|
|
"completions/mean_length": 377.9375,
|
|
"completions/mean_terminated_length": 377.9375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0575,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 11.154988288879395,
|
|
"learning_rate": 9.230476262104676e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2626679.0,
|
|
"reward": 0.22343748807907104,
|
|
"reward_std": 0.28182199597358704,
|
|
"rewards/format_reward/mean": 0.515625,
|
|
"rewards/format_reward/std": 0.1534975916147232,
|
|
"rewards/mcq_exact_match_reward/mean": 0.171875,
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 398.0,
|
|
"completions/max_terminated_length": 398.0,
|
|
"completions/mean_length": 12.125,
|
|
"completions/mean_terminated_length": 12.125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 11.924612045288086,
|
|
"learning_rate": 9.1400441557401e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2733791.0,
|
|
"reward": 0.4718749523162842,
|
|
"reward_std": 0.24831001460552216,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 979.0,
|
|
"completions/max_terminated_length": 979.0,
|
|
"completions/mean_length": 173.625,
|
|
"completions/mean_terminated_length": 173.625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0625,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 17.104806900024414,
|
|
"learning_rate": 9.045084971874737e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2827215.0,
|
|
"reward": 0.51953125,
|
|
"reward_std": 0.22175219655036926,
|
|
"rewards/format_reward/mean": 0.5078125,
|
|
"rewards/format_reward/std": 0.1406387835741043,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1075.0,
|
|
"completions/max_terminated_length": 1075.0,
|
|
"completions/mean_length": 161.796875,
|
|
"completions/mean_terminated_length": 161.796875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.065,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 24.93585968017578,
|
|
"learning_rate": 8.945702546981968e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2902970.0,
|
|
"reward": 0.40859371423721313,
|
|
"reward_std": 0.22210699319839478,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 928.0,
|
|
"completions/max_terminated_length": 928.0,
|
|
"completions/mean_length": 235.765625,
|
|
"completions/mean_terminated_length": 235.765625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0675,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 16.02617645263672,
|
|
"learning_rate": 8.842005554284295e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3005379.0,
|
|
"reward": 0.29999998211860657,
|
|
"reward_std": 0.2879316806793213,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.08908708393573761,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 935.0,
|
|
"completions/max_terminated_length": 935.0,
|
|
"completions/mean_length": 141.0,
|
|
"completions/mean_terminated_length": 141.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.07,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 29.071575164794922,
|
|
"learning_rate": 8.734107384920769e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3102595.0,
|
|
"reward": 0.4562499523162842,
|
|
"reward_std": 0.3846532702445984,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.08908708393573761,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 961.0,
|
|
"completions/max_terminated_length": 961.0,
|
|
"completions/mean_length": 109.046875,
|
|
"completions/mean_terminated_length": 109.046875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0725,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 14.398436546325684,
|
|
"learning_rate": 8.622126023955445e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3190334.0,
|
|
"reward": 0.6898437142372131,
|
|
"reward_std": 0.19096830487251282,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2034.0,
|
|
"completions/max_terminated_length": 2034.0,
|
|
"completions/mean_length": 313.390625,
|
|
"completions/mean_terminated_length": 313.390625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.075,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 23.27433967590332,
|
|
"learning_rate": 8.506183921362442e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3301191.0,
|
|
"reward": 0.6148437261581421,
|
|
"reward_std": 0.3767889142036438,
|
|
"rewards/format_reward/mean": 0.5234375,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1409.0,
|
|
"completions/max_terminated_length": 1409.0,
|
|
"completions/mean_length": 302.03125,
|
|
"completions/mean_terminated_length": 302.03125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0775,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 9.451353073120117,
|
|
"learning_rate": 8.386407858128706e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3431513.0,
|
|
"reward": 0.3273437023162842,
|
|
"reward_std": 0.24055621027946472,
|
|
"rewards/format_reward/mean": 0.4609375,
|
|
"rewards/format_reward/std": 0.16194961965084076,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1716.0,
|
|
"completions/max_terminated_length": 1716.0,
|
|
"completions/mean_length": 140.75,
|
|
"completions/mean_terminated_length": 140.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 19.545513153076172,
|
|
"learning_rate": 8.262928807620843e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3516601.0,
|
|
"reward": 0.42890626192092896,
|
|
"reward_std": 0.0956839770078659,
|
|
"rewards/format_reward/mean": 0.5390625,
|
|
"rewards/format_reward/std": 0.2236899733543396,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 333.0,
|
|
"completions/max_terminated_length": 333.0,
|
|
"completions/mean_length": 26.1875,
|
|
"completions/mean_terminated_length": 26.1875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0825,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 11.879921913146973,
|
|
"learning_rate": 8.135881792367685e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3610013.0,
|
|
"reward": 0.7226561903953552,
|
|
"reward_std": 0.19158241152763367,
|
|
"rewards/format_reward/mean": 0.5078125,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1395.0,
|
|
"completions/max_terminated_length": 1395.0,
|
|
"completions/mean_length": 130.375,
|
|
"completions/mean_terminated_length": 130.375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.085,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 22.375093460083008,
|
|
"learning_rate": 8.005405736415125e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3706909.0,
|
|
"reward": 0.4156249761581421,
|
|
"reward_std": 0.34389790892601013,
|
|
"rewards/format_reward/mean": 0.5625,
|
|
"rewards/format_reward/std": 0.18898223340511322,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1169.0,
|
|
"completions/mean_length": 228.734375,
|
|
"completions/mean_terminated_length": 170.0483856201172,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0875,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 17.84543228149414,
|
|
"learning_rate": 7.871643313414718e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3815884.0,
|
|
"reward": 0.7546874284744263,
|
|
"reward_std": 0.28800931572914124,
|
|
"rewards/format_reward/mean": 0.515625,
|
|
"rewards/format_reward/std": 0.1534975916147232,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 311.0,
|
|
"completions/max_terminated_length": 311.0,
|
|
"completions/mean_length": 27.609375,
|
|
"completions/mean_terminated_length": 27.609375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 31.98973274230957,
|
|
"learning_rate": 7.734740790612136e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3941163.0,
|
|
"reward": 0.4898437261581421,
|
|
"reward_std": 0.2993735373020172,
|
|
"rewards/format_reward/mean": 0.5234375,
|
|
"rewards/format_reward/std": 0.10652101784944534,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1835.0,
|
|
"completions/max_terminated_length": 1835.0,
|
|
"completions/mean_length": 61.625,
|
|
"completions/mean_terminated_length": 61.625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0925,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 12.227835655212402,
|
|
"learning_rate": 7.594847868906076e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4027779.0,
|
|
"reward": 0.528124988079071,
|
|
"reward_std": 0.1930253505706787,
|
|
"rewards/format_reward/mean": 0.4375,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1583.0,
|
|
"completions/max_terminated_length": 1583.0,
|
|
"completions/mean_length": 239.546875,
|
|
"completions/mean_terminated_length": 239.546875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.095,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 20.220304489135742,
|
|
"learning_rate": 7.452117519152541e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4120606.0,
|
|
"reward": 0.39531248807907104,
|
|
"reward_std": 0.3611350357532501,
|
|
"rewards/format_reward/mean": 0.671875,
|
|
"rewards/format_reward/std": 0.2847827076911926,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 926.0,
|
|
"completions/max_terminated_length": 926.0,
|
|
"completions/mean_length": 72.34375,
|
|
"completions/mean_terminated_length": 72.34375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0975,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 21.585006713867188,
|
|
"learning_rate": 7.306705814893439e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4210956.0,
|
|
"reward": 0.37812501192092896,
|
|
"reward_std": 0.23370197415351868,
|
|
"rewards/format_reward/mean": 0.65625,
|
|
"rewards/format_reward/std": 0.233588308095932,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1230.0,
|
|
"completions/max_terminated_length": 1230.0,
|
|
"completions/mean_length": 107.265625,
|
|
"completions/mean_terminated_length": 107.265625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 19.986061096191406,
|
|
"learning_rate": 7.158771761692464e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4302349.0,
|
|
"reward": 0.28593748807907104,
|
|
"reward_std": 0.20377102494239807,
|
|
"rewards/format_reward/mean": 0.671875,
|
|
"rewards/format_reward/std": 0.23935678601264954,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 784.0,
|
|
"completions/max_terminated_length": 784.0,
|
|
"completions/mean_length": 170.578125,
|
|
"completions/mean_terminated_length": 170.578125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1025,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 23.84389305114746,
|
|
"learning_rate": 7.008477123264847e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4402810.0,
|
|
"reward": 0.6890624761581421,
|
|
"reward_std": 0.3438800573348999,
|
|
"rewards/format_reward/mean": 0.640625,
|
|
"rewards/format_reward/std": 0.24346621334552765,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1099.0,
|
|
"completions/mean_length": 197.78125,
|
|
"completions/mean_terminated_length": 106.78688049316406,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.105,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 30.521644592285156,
|
|
"learning_rate": 6.855986244591103e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4512004.0,
|
|
"reward": 0.5382812023162842,
|
|
"reward_std": 0.25127214193344116,
|
|
"rewards/format_reward/mean": 0.6953125,
|
|
"rewards/format_reward/std": 0.31644338369369507,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1161.0,
|
|
"completions/mean_length": 134.6875,
|
|
"completions/mean_terminated_length": 104.31746673583984,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1075,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 28.4834041595459,
|
|
"learning_rate": 6.701465872208216e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4623384.0,
|
|
"reward": 0.2992187440395355,
|
|
"reward_std": 0.23480820655822754,
|
|
"rewards/format_reward/mean": 0.8046875,
|
|
"rewards/format_reward/std": 0.29028159379959106,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 715.0,
|
|
"completions/max_terminated_length": 715.0,
|
|
"completions/mean_length": 76.203125,
|
|
"completions/mean_terminated_length": 76.203125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.11,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 14.868675231933594,
|
|
"learning_rate": 6.545084971874736e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4706005.0,
|
|
"reward": 0.32734376192092896,
|
|
"reward_std": 0.24423527717590332,
|
|
"rewards/format_reward/mean": 0.9296875,
|
|
"rewards/format_reward/std": 0.1751912236213684,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 668.0,
|
|
"completions/mean_length": 154.5625,
|
|
"completions/mean_terminated_length": 61.44261932373047,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1125,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 15.19470500946045,
|
|
"learning_rate": 6.387014543809223e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4798985.0,
|
|
"reward": 0.5421874523162842,
|
|
"reward_std": 0.3319449722766876,
|
|
"rewards/format_reward/mean": 0.734375,
|
|
"rewards/format_reward/std": 0.30820462107658386,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 543.0,
|
|
"completions/mean_length": 72.453125,
|
|
"completions/mean_terminated_length": 41.09524154663086,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.115,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 15.99255084991455,
|
|
"learning_rate": 6.227427435703995e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4918510.0,
|
|
"reward": 0.31406253576278687,
|
|
"reward_std": 0.0530330091714859,
|
|
"rewards/format_reward/mean": 0.796875,
|
|
"rewards/format_reward/std": 0.3642643094062805,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 608.0,
|
|
"completions/mean_length": 79.3125,
|
|
"completions/mean_terminated_length": 48.06349563598633,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1175,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 18.731386184692383,
|
|
"learning_rate": 6.066498153718734e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5018826.0,
|
|
"reward": 0.6578124761581421,
|
|
"reward_std": 0.29044055938720703,
|
|
"rewards/format_reward/mean": 0.796875,
|
|
"rewards/format_reward/std": 0.3177144229412079,
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 855.0,
|
|
"completions/max_terminated_length": 855.0,
|
|
"completions/mean_length": 66.71875,
|
|
"completions/mean_terminated_length": 66.71875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.12,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 18.806800842285156,
|
|
"learning_rate": 5.90440267166055e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5152376.0,
|
|
"reward": 0.3304687738418579,
|
|
"reward_std": 0.30621567368507385,
|
|
"rewards/format_reward/mean": 0.8046875,
|
|
"rewards/format_reward/std": 0.37392371892929077,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 774.0,
|
|
"completions/mean_length": 102.953125,
|
|
"completions/mean_terminated_length": 72.0793685913086,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1225,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 34.07569885253906,
|
|
"learning_rate": 5.741318238559209e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5259269.0,
|
|
"reward": 0.45234376192092896,
|
|
"reward_std": 0.2820115089416504,
|
|
"rewards/format_reward/mean": 0.7734375,
|
|
"rewards/format_reward/std": 0.2807259261608124,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 414.0,
|
|
"completions/mean_length": 127.9375,
|
|
"completions/mean_terminated_length": 66.0,
|
|
"completions/min_length": 11.0,
|
|
"completions/min_terminated_length": 11.0,
|
|
"epoch": 0.125,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 27.46225929260254,
|
|
"learning_rate": 5.577423184847931e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5354257.0,
|
|
"reward": 0.592968761920929,
|
|
"reward_std": 0.22307650744915009,
|
|
"rewards/format_reward/mean": 0.9296875,
|
|
"rewards/format_reward/std": 0.23345555365085602,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1015.0,
|
|
"completions/mean_length": 120.328125,
|
|
"completions/mean_terminated_length": 89.73016357421875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1275,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 18.311309814453125,
|
|
"learning_rate": 5.412896727361662e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5452862.0,
|
|
"reward": 0.510937511920929,
|
|
"reward_std": 0.3758324980735779,
|
|
"rewards/format_reward/mean": 0.890625,
|
|
"rewards/format_reward/std": 0.2592533528804779,
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1425.0,
|
|
"completions/mean_length": 101.140625,
|
|
"completions/mean_terminated_length": 70.23809814453125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.13,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 15.757895469665527,
|
|
"learning_rate": 5.247918773366111e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5554471.0,
|
|
"reward": 0.43046876788139343,
|
|
"reward_std": 0.20688433945178986,
|
|
"rewards/format_reward/mean": 0.8671875,
|
|
"rewards/format_reward/std": 0.28510910272598267,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1296.0,
|
|
"completions/max_terminated_length": 1296.0,
|
|
"completions/mean_length": 67.234375,
|
|
"completions/mean_terminated_length": 67.234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1325,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 11.062511444091797,
|
|
"learning_rate": 5.082669723831793e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5651566.0,
|
|
"reward": 0.42109376192092896,
|
|
"reward_std": 0.17108294367790222,
|
|
"rewards/format_reward/mean": 0.9296875,
|
|
"rewards/format_reward/std": 0.1751912236213684,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 447.0,
|
|
"completions/mean_length": 153.5,
|
|
"completions/mean_terminated_length": 27.200000762939453,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.135,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 16.75969123840332,
|
|
"learning_rate": 4.917330276168208e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5753262.0,
|
|
"reward": 0.6187499761581421,
|
|
"reward_std": 0.307129442691803,
|
|
"rewards/format_reward/mean": 0.875,
|
|
"rewards/format_reward/std": 0.28171807527542114,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 462.0,
|
|
"completions/max_terminated_length": 462.0,
|
|
"completions/mean_length": 48.203125,
|
|
"completions/mean_terminated_length": 48.203125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1375,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 14.472973823547363,
|
|
"learning_rate": 4.752081226633888e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5829763.0,
|
|
"reward": 0.45390626788139343,
|
|
"reward_std": 0.1621313989162445,
|
|
"rewards/format_reward/mean": 0.9453125,
|
|
"rewards/format_reward/std": 0.15728822350502014,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 488.0,
|
|
"completions/mean_length": 70.390625,
|
|
"completions/mean_terminated_length": 39.000003814697266,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.14,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 9.199593544006348,
|
|
"learning_rate": 4.5871032726383385e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5927572.0,
|
|
"reward": 0.18906250596046448,
|
|
"reward_std": 0.11857090145349503,
|
|
"rewards/format_reward/mean": 0.953125,
|
|
"rewards/format_reward/std": 0.21304203569889069,
|
|
"rewards/mcq_exact_match_reward/mean": 0.09375,
|
|
"rewards/mcq_exact_match_reward/std": 0.29378482699394226,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 872.0,
|
|
"completions/mean_length": 107.125,
|
|
"completions/mean_terminated_length": 44.51612854003906,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1425,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 14.753751754760742,
|
|
"learning_rate": 4.4225768151520694e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6032756.0,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.28659987449645996,
|
|
"rewards/format_reward/mean": 0.90625,
|
|
"rewards/format_reward/std": 0.233588308095932,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 553.0,
|
|
"completions/max_terminated_length": 553.0,
|
|
"completions/mean_length": 36.59375,
|
|
"completions/mean_terminated_length": 36.59375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.145,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 18.922006607055664,
|
|
"learning_rate": 4.258681761440789e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6120570.0,
|
|
"reward": 0.37968751788139343,
|
|
"reward_std": 0.24809977412223816,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 667.0,
|
|
"completions/mean_length": 143.96875,
|
|
"completions/mean_terminated_length": 50.32786560058594,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1475,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 6.868103504180908,
|
|
"learning_rate": 4.095597328339452e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6236432.0,
|
|
"reward": 0.5914062261581421,
|
|
"reward_std": 0.20316563546657562,
|
|
"rewards/format_reward/mean": 0.9140625,
|
|
"rewards/format_reward/std": 0.27537402510643005,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 407.0,
|
|
"completions/max_terminated_length": 407.0,
|
|
"completions/mean_length": 21.84375,
|
|
"completions/mean_terminated_length": 21.84375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.15,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 15.219839096069336,
|
|
"learning_rate": 3.9335018462812664e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6327390.0,
|
|
"reward": 0.7085937857627869,
|
|
"reward_std": 0.2691788673400879,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 164.0,
|
|
"completions/mean_length": 80.140625,
|
|
"completions/mean_terminated_length": 16.66128921508789,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1525,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 10.40705394744873,
|
|
"learning_rate": 3.772572564296004e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6440111.0,
|
|
"reward": 0.5804687738418579,
|
|
"reward_std": 0.24087271094322205,
|
|
"rewards/format_reward/mean": 0.9609375,
|
|
"rewards/format_reward/std": 0.18483558297157288,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 366.0,
|
|
"completions/mean_length": 58.875,
|
|
"completions/mean_terminated_length": 27.30158805847168,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.155,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 16.986087799072266,
|
|
"learning_rate": 3.612985456190778e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6518671.0,
|
|
"reward": 0.7054687738418579,
|
|
"reward_std": 0.26613086462020874,
|
|
"rewards/format_reward/mean": 0.9609375,
|
|
"rewards/format_reward/std": 0.18483558297157288,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 460.0,
|
|
"completions/mean_length": 122.21875,
|
|
"completions/mean_terminated_length": 27.508195877075195,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1575,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 16.218698501586914,
|
|
"learning_rate": 3.454915028125263e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6611805.0,
|
|
"reward": 0.561718761920929,
|
|
"reward_std": 0.14248578250408173,
|
|
"rewards/format_reward/mean": 0.9296875,
|
|
"rewards/format_reward/std": 0.2498759627342224,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 876.0,
|
|
"completions/max_terminated_length": 876.0,
|
|
"completions/mean_length": 53.1875,
|
|
"completions/mean_terminated_length": 53.1875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.16,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 12.511017799377441,
|
|
"learning_rate": 3.2985341277917846e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6698345.0,
|
|
"reward": 0.32890626788139343,
|
|
"reward_std": 0.04640388861298561,
|
|
"rewards/format_reward/mean": 0.9453125,
|
|
"rewards/format_reward/std": 0.15728822350502014,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 13.28125,
|
|
"completions/mean_terminated_length": 13.28125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1625,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 4.530981063842773,
|
|
"learning_rate": 3.1440137554088953e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6789451.0,
|
|
"reward": 0.3968750238418579,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1223.0,
|
|
"completions/mean_length": 134.359375,
|
|
"completions/mean_terminated_length": 72.6290283203125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.165,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 16.526830673217773,
|
|
"learning_rate": 2.9915228767351535e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6907946.0,
|
|
"reward": 0.48593753576278687,
|
|
"reward_std": 0.18089531362056732,
|
|
"rewards/format_reward/mean": 0.953125,
|
|
"rewards/format_reward/std": 0.19352105259895325,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 481.0,
|
|
"completions/max_terminated_length": 481.0,
|
|
"completions/mean_length": 35.515625,
|
|
"completions/mean_terminated_length": 35.515625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.1675,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 9.258106231689453,
|
|
"learning_rate": 2.841228238307536e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 6978851.0,
|
|
"reward": 0.8031250238418579,
|
|
"reward_std": 0.19044628739356995,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.703125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 618.0,
|
|
"completions/mean_length": 63.296875,
|
|
"completions/mean_terminated_length": 31.79365348815918,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.17,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 18.417194366455078,
|
|
"learning_rate": 2.6932941851065615e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7080566.0,
|
|
"reward": 0.6421874761581421,
|
|
"reward_std": 0.29052332043647766,
|
|
"rewards/format_reward/mean": 0.953125,
|
|
"rewards/format_reward/std": 0.21304203569889069,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 550.0,
|
|
"completions/max_terminated_length": 550.0,
|
|
"completions/mean_length": 33.65625,
|
|
"completions/mean_terminated_length": 33.65625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1725,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 28.758838653564453,
|
|
"learning_rate": 2.547882480847461e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7180000.0,
|
|
"reward": 0.534375011920929,
|
|
"reward_std": 0.33078908920288086,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.12198751419782639,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 480.0,
|
|
"completions/max_terminated_length": 480.0,
|
|
"completions/mean_length": 20.453125,
|
|
"completions/mean_terminated_length": 20.453125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.175,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 11.625492095947266,
|
|
"learning_rate": 2.4051521310939254e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7277581.0,
|
|
"reward": 0.5843750238418579,
|
|
"reward_std": 0.15992169082164764,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 192.0,
|
|
"completions/max_terminated_length": 192.0,
|
|
"completions/mean_length": 18.796875,
|
|
"completions/mean_terminated_length": 18.796875,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1775,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 8.98841667175293,
|
|
"learning_rate": 2.2652592093878665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7354840.0,
|
|
"reward": 0.41093751788139343,
|
|
"reward_std": 0.11330723762512207,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 336.0,
|
|
"completions/max_terminated_length": 336.0,
|
|
"completions/mean_length": 20.78125,
|
|
"completions/mean_terminated_length": 20.78125,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.18,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 11.35882568359375,
|
|
"learning_rate": 2.128356686585282e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7438378.0,
|
|
"reward": 0.45859378576278687,
|
|
"reward_std": 0.17108294367790222,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 15.0,
|
|
"completions/max_terminated_length": 15.0,
|
|
"completions/mean_length": 13.03125,
|
|
"completions/mean_terminated_length": 13.03125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1825,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 11.875134468078613,
|
|
"learning_rate": 1.9945942635848745e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7510852.0,
|
|
"reward": 0.7390625476837158,
|
|
"reward_std": 0.17329266667366028,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 370.0,
|
|
"completions/max_terminated_length": 370.0,
|
|
"completions/mean_length": 19.203125,
|
|
"completions/mean_terminated_length": 19.203125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.185,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 13.027864456176758,
|
|
"learning_rate": 1.8641182076323148e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7608273.0,
|
|
"reward": 0.7390625476837158,
|
|
"reward_std": 0.18484057486057281,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.640625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1156.0,
|
|
"completions/max_terminated_length": 1156.0,
|
|
"completions/mean_length": 50.65625,
|
|
"completions/mean_terminated_length": 50.65625,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1875,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 8.088542938232422,
|
|
"learning_rate": 1.7370711923791564e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 7691619.0,
|
|
"reward": 0.5843750238418579,
|
|
"reward_std": 0.16887325048446655,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1471.0,
|
|
"completions/max_terminated_length": 1471.0,
|
|
"completions/mean_length": 43.625,
|
|
"completions/mean_terminated_length": 43.625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.19,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 10.211929321289062,
|
|
"learning_rate": 1.6135921418712955e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7776755.0,
|
|
"reward": 0.8812500238418579,
|
|
"reward_std": 0.1552036553621292,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.78125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 249.0,
|
|
"completions/max_terminated_length": 249.0,
|
|
"completions/mean_length": 16.90625,
|
|
"completions/mean_terminated_length": 16.90625,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1925,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 12.14816665649414,
|
|
"learning_rate": 1.493816078637557e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7856829.0,
|
|
"reward": 0.4437500238418579,
|
|
"reward_std": 0.1552036553621292,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 400.0,
|
|
"completions/max_terminated_length": 400.0,
|
|
"completions/mean_length": 17.484375,
|
|
"completions/mean_terminated_length": 17.484375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.195,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 8.064270973205566,
|
|
"learning_rate": 1.3778739760445552e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 7957444.0,
|
|
"reward": 0.6195312738418579,
|
|
"reward_std": 0.06007346510887146,
|
|
"rewards/format_reward/mean": 0.8828125,
|
|
"rewards/format_reward/std": 0.21347814798355103,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 862.0,
|
|
"completions/max_terminated_length": 862.0,
|
|
"completions/mean_length": 28.59375,
|
|
"completions/mean_terminated_length": 28.59375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.1975,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 15.456476211547852,
|
|
"learning_rate": 1.2658926150792322e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8047770.0,
|
|
"reward": 0.753125011920929,
|
|
"reward_std": 0.23930205404758453,
|
|
"rewards/format_reward/mean": 0.96875,
|
|
"rewards/format_reward/std": 0.1510545015335083,
|
|
"rewards/mcq_exact_match_reward/mean": 0.65625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 413.0,
|
|
"completions/max_terminated_length": 413.0,
|
|
"completions/mean_length": 23.6875,
|
|
"completions/mean_terminated_length": 23.6875,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 6.343238353729248,
|
|
"learning_rate": 1.1579944457157059e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8158614.0,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 12.328125,
|
|
"completions/mean_terminated_length": 12.328125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.2025,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 19.15117073059082,
|
|
"learning_rate": 1.0542974530180327e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 8261035.0,
|
|
"reward": 0.5007812976837158,
|
|
"reward_std": 0.1689612865447998,
|
|
"rewards/format_reward/mean": 0.9453125,
|
|
"rewards/format_reward/std": 0.15728822350502014,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 118.0,
|
|
"completions/max_terminated_length": 118.0,
|
|
"completions/mean_length": 16.375,
|
|
"completions/mean_terminated_length": 16.375,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.205,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 12.861918449401855,
|
|
"learning_rate": 9.549150281252632e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 8359787.0,
|
|
"reward": 0.20937500894069672,
|
|
"reward_std": 0.16887325048446655,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.109375,
|
|
"rewards/mcq_exact_match_reward/std": 0.3145764470100403,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 119.0,
|
|
"completions/max_terminated_length": 119.0,
|
|
"completions/mean_length": 15.03125,
|
|
"completions/mean_terminated_length": 15.03125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2075,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 14.756561279296875,
|
|
"learning_rate": 8.599558442598998e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 8441949.0,
|
|
"reward": 0.6156250238418579,
|
|
"reward_std": 0.1315089464187622,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 310.0,
|
|
"completions/max_terminated_length": 310.0,
|
|
"completions/mean_length": 19.09375,
|
|
"completions/mean_terminated_length": 19.09375,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.21,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 10.06808090209961,
|
|
"learning_rate": 7.695237378953224e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 8543755.0,
|
|
"reward": 0.8500000238418579,
|
|
"reward_std": 0.19727616012096405,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 75.015625,
|
|
"completions/mean_terminated_length": 11.370967864990234,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.2125,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 14.676437377929688,
|
|
"learning_rate": 6.837175952121304e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 8640956.0,
|
|
"reward": 0.4742187559604645,
|
|
"reward_std": 0.34538891911506653,
|
|
"rewards/format_reward/mean": 0.8359375,
|
|
"rewards/format_reward/std": 0.2824873626232147,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 22.0,
|
|
"completions/max_terminated_length": 22.0,
|
|
"completions/mean_length": 13.515625,
|
|
"completions/mean_terminated_length": 13.515625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.215,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 14.535361289978027,
|
|
"learning_rate": 6.026312439675551e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 8729309.0,
|
|
"reward": 0.55078125,
|
|
"reward_std": 0.1658112108707428,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 101.0,
|
|
"completions/mean_length": 46.234375,
|
|
"completions/mean_terminated_length": 14.460318565368652,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2175,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 18.44989776611328,
|
|
"learning_rate": 5.263533508961826e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 8833324.0,
|
|
"reward": 0.81640625,
|
|
"reward_std": 0.30399811267852783,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.71875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 347.0,
|
|
"completions/max_terminated_length": 347.0,
|
|
"completions/mean_length": 22.5625,
|
|
"completions/mean_terminated_length": 22.5625,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.22,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 14.351069450378418,
|
|
"learning_rate": 4.549673247541874e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 8932336.0,
|
|
"reward": 0.6781250238418579,
|
|
"reward_std": 0.2519446909427643,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 388.0,
|
|
"completions/max_terminated_length": 388.0,
|
|
"completions/mean_length": 27.359375,
|
|
"completions/mean_terminated_length": 27.359375,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.2225,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 14.434761047363281,
|
|
"learning_rate": 3.8855122511307626e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 9044767.0,
|
|
"reward": 0.6312500238418579,
|
|
"reward_std": 0.31300368905067444,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1093.0,
|
|
"completions/max_terminated_length": 1093.0,
|
|
"completions/mean_length": 30.03125,
|
|
"completions/mean_terminated_length": 30.03125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.225,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 15.73192310333252,
|
|
"learning_rate": 3.271776770026963e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 9157089.0,
|
|
"reward": 0.5367187857627869,
|
|
"reward_std": 0.1834089457988739,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 499.0,
|
|
"completions/max_terminated_length": 499.0,
|
|
"completions/mean_length": 26.140625,
|
|
"completions/mean_terminated_length": 26.140625,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2275,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 15.007508277893066,
|
|
"learning_rate": 2.7091379149682682e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 9224186.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.15992169082164764,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 305.0,
|
|
"completions/max_terminated_length": 305.0,
|
|
"completions/mean_length": 19.234375,
|
|
"completions/mean_terminated_length": 19.234375,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.23,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 7.761956691741943,
|
|
"learning_rate": 2.1982109232821176e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 9321929.0,
|
|
"reward": 0.5218750238418579,
|
|
"reward_std": 0.1530819982290268,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 45.453125,
|
|
"completions/mean_terminated_length": 13.666666984558105,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2325,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 12.21143627166748,
|
|
"learning_rate": 1.7395544861325718e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 9410454.0,
|
|
"reward": 0.64453125,
|
|
"reward_std": 0.07132276892662048,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 93
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 321.0,
|
|
"completions/mean_length": 87.828125,
|
|
"completions/mean_terminated_length": 24.596773147583008,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.235,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 16.356138229370117,
|
|
"learning_rate": 1.3336701375997127e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 9519363.0,
|
|
"reward": 0.7523437738418579,
|
|
"reward_std": 0.23261141777038574,
|
|
"rewards/format_reward/mean": 0.9609375,
|
|
"rewards/format_reward/std": 0.18483558297157288,
|
|
"rewards/mcq_exact_match_reward/mean": 0.65625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 94
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 400.0,
|
|
"completions/max_terminated_length": 400.0,
|
|
"completions/mean_length": 19.109375,
|
|
"completions/mean_terminated_length": 19.109375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.2375,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 7.05694055557251,
|
|
"learning_rate": 9.810017062595321e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 9619178.0,
|
|
"reward": 0.5835937261581421,
|
|
"reward_std": 0.12194531410932541,
|
|
"rewards/format_reward/mean": 0.9921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 326.0,
|
|
"completions/max_terminated_length": 326.0,
|
|
"completions/mean_length": 23.65625,
|
|
"completions/mean_terminated_length": 23.65625,
|
|
"completions/min_length": 12.0,
|
|
"completions/min_terminated_length": 12.0,
|
|
"epoch": 0.24,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 13.755544662475586,
|
|
"learning_rate": 6.819348298638839e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 9730556.0,
|
|
"reward": 0.3476562798023224,
|
|
"reward_std": 0.2659573554992676,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.13886408507823944,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 96
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 278.0,
|
|
"completions/mean_length": 52.09375,
|
|
"completions/mean_terminated_length": 20.41269874572754,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.2425,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 16.00373077392578,
|
|
"learning_rate": 4.367965336512403e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 9832362.0,
|
|
"reward": 0.6765625476837158,
|
|
"reward_std": 0.303753525018692,
|
|
"rewards/format_reward/mean": 0.984375,
|
|
"rewards/format_reward/std": 0.125,
|
|
"rewards/mcq_exact_match_reward/mean": 0.578125,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 97
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 43.0,
|
|
"completions/max_terminated_length": 43.0,
|
|
"completions/mean_length": 13.59375,
|
|
"completions/mean_terminated_length": 13.59375,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.245,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 11.894449234008789,
|
|
"learning_rate": 2.458548727494292e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 9954448.0,
|
|
"reward": 0.37890625,
|
|
"reward_std": 0.09131823480129242,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.10652101784944534,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 98
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 584.0,
|
|
"completions/max_terminated_length": 584.0,
|
|
"completions/mean_length": 38.921875,
|
|
"completions/mean_terminated_length": 38.921875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.2475,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 21.100183486938477,
|
|
"learning_rate": 1.0931863906127325e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 10048931.0,
|
|
"reward": 0.4414062798023224,
|
|
"reward_std": 0.2259259968996048,
|
|
"rewards/format_reward/mean": 0.9765625,
|
|
"rewards/format_reward/std": 0.10652101784944534,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 99
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 18.0,
|
|
"completions/max_terminated_length": 18.0,
|
|
"completions/mean_length": 13.078125,
|
|
"completions/mean_terminated_length": 13.078125,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.25,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 9.943309783935547,
|
|
"learning_rate": 2.733713295369755e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 10141176.0,
|
|
"reward": 0.7718750238418579,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 100
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 100,
|
|
"num_input_tokens_seen": 10141176,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|