Files
goldengoose-corr-v4-0.25-200/checkpoint-100/trainer_state.json
ModelHub XC 08060cde78 初始化项目,由ModelHub XC社区提供模型
Model: cjiao/goldengoose-corr-v4-0.25-200
Source: Original Platform
2026-05-30 01:15:26 +08:00

2735 lines
98 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.125,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1743.0,
"completions/mean_length": 471.375,
"completions/mean_terminated_length": 446.3492431640625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.00125,
"frac_reward_zero_std": 0.0,
"grad_norm": 27.719789505004883,
"learning_rate": 0.0,
"loss": -0.0,
"num_tokens": 107576.0,
"reward": 0.32109373807907104,
"reward_std": 0.35813236236572266,
"rewards/format_reward/mean": 0.3984375,
"rewards/format_reward/std": 0.31090864539146423,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1343.0,
"completions/max_terminated_length": 1343.0,
"completions/mean_length": 420.609375,
"completions/mean_terminated_length": 420.609375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0025,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.845207214355469,
"learning_rate": 1e-07,
"loss": -0.0,
"num_tokens": 217615.0,
"reward": 0.0898437350988388,
"reward_std": 0.1637348234653473,
"rewards/format_reward/mean": 0.2734375,
"rewards/format_reward/std": 0.2662152051925659,
"rewards/mcq_exact_match_reward/mean": 0.0625,
"rewards/mcq_exact_match_reward/std": 0.24397502839565277,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1600.0,
"completions/mean_length": 604.359375,
"completions/mean_terminated_length": 581.4444580078125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.00375,
"frac_reward_zero_std": 0.125,
"grad_norm": 3.8941690921783447,
"learning_rate": 2e-07,
"loss": -0.0,
"num_tokens": 346454.0,
"reward": 0.2398437261581421,
"reward_std": 0.34310027956962585,
"rewards/format_reward/mean": 0.3671875,
"rewards/format_reward/std": 0.23974503576755524,
"rewards/mcq_exact_match_reward/mean": 0.203125,
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1399.0,
"completions/max_terminated_length": 1399.0,
"completions/mean_length": 485.09375,
"completions/mean_terminated_length": 485.09375,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.005,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.741612672805786,
"learning_rate": 3e-07,
"loss": 0.0,
"num_tokens": 453380.0,
"reward": 0.18437498807907104,
"reward_std": 0.2578701674938202,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.33923351764678955,
"rewards/mcq_exact_match_reward/mean": 0.140625,
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1614.0,
"completions/mean_length": 588.8125,
"completions/mean_terminated_length": 517.0491333007812,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.00625,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.286240339279175,
"learning_rate": 4e-07,
"loss": -0.0,
"num_tokens": 582664.0,
"reward": 0.25390625,
"reward_std": 0.264077365398407,
"rewards/format_reward/mean": 0.3515625,
"rewards/format_reward/std": 0.24688033759593964,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1609.0,
"completions/mean_length": 472.859375,
"completions/mean_terminated_length": 447.857177734375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0075,
"frac_reward_zero_std": 0.125,
"grad_norm": 14.97139835357666,
"learning_rate": 5e-07,
"loss": 0.0,
"num_tokens": 689791.0,
"reward": 0.17031249403953552,
"reward_std": 0.27876684069633484,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.2916666865348816,
"rewards/mcq_exact_match_reward/mean": 0.140625,
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1476.0,
"completions/mean_length": 554.796875,
"completions/mean_terminated_length": 531.0952758789062,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.00875,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5775206089019775,
"learning_rate": 6e-07,
"loss": -0.0,
"num_tokens": 802154.0,
"reward": 0.26953125,
"reward_std": 0.3855266869068146,
"rewards/format_reward/mean": 0.3515625,
"rewards/format_reward/std": 0.2302463799715042,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1346.0,
"completions/mean_length": 447.140625,
"completions/mean_terminated_length": 421.7301940917969,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.01,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.306600570678711,
"learning_rate": 7e-07,
"loss": -0.0,
"num_tokens": 909003.0,
"reward": 0.3125,
"reward_std": 0.41604068875312805,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.2745848298072815,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1898.0,
"completions/max_terminated_length": 1898.0,
"completions/mean_length": 663.96875,
"completions/mean_terminated_length": 663.96875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.01125,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.932246208190918,
"learning_rate": 8e-07,
"loss": 0.0,
"num_tokens": 1022305.0,
"reward": 0.1484375,
"reward_std": 0.2908669710159302,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.2592533528804779,
"rewards/mcq_exact_match_reward/mean": 0.109375,
"rewards/mcq_exact_match_reward/std": 0.3145764470100403,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1804.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 508.078125,
"completions/mean_terminated_length": 508.078125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.0125,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.8813296556472778,
"learning_rate": 9e-07,
"loss": 0.0,
"num_tokens": 1149174.0,
"reward": 0.39531248807907104,
"reward_std": 0.25146484375,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.2741328477859497,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1901.0,
"completions/max_terminated_length": 1901.0,
"completions/mean_length": 587.140625,
"completions/mean_terminated_length": 587.140625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.01375,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.928985118865967,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 1288087.0,
"reward": 0.26640623807907104,
"reward_std": 0.413688063621521,
"rewards/format_reward/mean": 0.3203125,
"rewards/format_reward/std": 0.27265870571136475,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1818.0,
"completions/max_terminated_length": 1818.0,
"completions/mean_length": 631.4375,
"completions/mean_terminated_length": 631.4375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.015,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.051449298858643,
"learning_rate": 9.999316524962345e-07,
"loss": -0.0,
"num_tokens": 1432555.0,
"reward": 0.24062499403953552,
"reward_std": 0.37458372116088867,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.2357022762298584,
"rewards/mcq_exact_match_reward/mean": 0.203125,
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1405.0,
"completions/max_terminated_length": 1405.0,
"completions/mean_length": 502.890625,
"completions/mean_terminated_length": 502.890625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.01625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.412359237670898,
"learning_rate": 9.99726628670463e-07,
"loss": 0.0,
"num_tokens": 1562860.0,
"reward": 0.3140624761581421,
"reward_std": 0.4029204845428467,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.23935678601264954,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 911.0,
"completions/max_terminated_length": 911.0,
"completions/mean_length": 404.0,
"completions/mean_terminated_length": 404.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0175,
"frac_reward_zero_std": 0.125,
"grad_norm": 5.526406288146973,
"learning_rate": 9.993849845741523e-07,
"loss": 0.0,
"num_tokens": 1660916.0,
"reward": 0.296875,
"reward_std": 0.35833704471588135,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.25,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1440.0,
"completions/mean_length": 540.46875,
"completions/mean_terminated_length": 491.83868408203125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.01875,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.893363952636719,
"learning_rate": 9.989068136093872e-07,
"loss": 0.0,
"num_tokens": 1783450.0,
"reward": 0.38203123211860657,
"reward_std": 0.30731916427612305,
"rewards/format_reward/mean": 0.3828125,
"rewards/format_reward/std": 0.2313210517168045,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1407.0,
"completions/max_terminated_length": 1407.0,
"completions/mean_length": 553.1875,
"completions/mean_terminated_length": 553.1875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.02,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.701499342918396,
"learning_rate": 9.982922465033348e-07,
"loss": -0.0,
"num_tokens": 1904958.0,
"reward": 0.5335937738418579,
"reward_std": 0.41488444805145264,
"rewards/format_reward/mean": 0.4921875,
"rewards/format_reward/std": 0.1406387835741043,
"rewards/mcq_exact_match_reward/mean": 0.484375,
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1545.0,
"completions/max_terminated_length": 1545.0,
"completions/mean_length": 344.5,
"completions/mean_terminated_length": 344.5,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.02125,
"frac_reward_zero_std": 0.375,
"grad_norm": 12.08929443359375,
"learning_rate": 9.975414512725056e-07,
"loss": -0.0,
"num_tokens": 2031982.0,
"reward": 0.500781238079071,
"reward_std": 0.28046733140945435,
"rewards/format_reward/mean": 0.4765625,
"rewards/format_reward/std": 0.10652101784944534,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1117.0,
"completions/mean_length": 473.9375,
"completions/mean_terminated_length": 448.952392578125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0225,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.033034324645996,
"learning_rate": 9.966546331768192e-07,
"loss": -0.0,
"num_tokens": 2146218.0,
"reward": 0.606249988079071,
"reward_std": 0.4029581844806671,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.24397502839565277,
"rewards/mcq_exact_match_reward/mean": 0.5625,
"rewards/mcq_exact_match_reward/std": 0.5,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1098.0,
"completions/max_terminated_length": 1098.0,
"completions/mean_length": 385.65625,
"completions/mean_terminated_length": 385.65625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.02375,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.02350425720215,
"learning_rate": 9.956320346634875e-07,
"loss": -0.0,
"num_tokens": 2258940.0,
"reward": 0.36953121423721313,
"reward_std": 0.33910423517227173,
"rewards/format_reward/mean": 0.4140625,
"rewards/format_reward/std": 0.2280818521976471,
"rewards/mcq_exact_match_reward/mean": 0.328125,
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1207.0,
"completions/max_terminated_length": 1207.0,
"completions/mean_length": 400.625,
"completions/mean_terminated_length": 400.625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.025,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.332343101501465,
"learning_rate": 9.944739353007341e-07,
"loss": 0.0,
"num_tokens": 2363068.0,
"reward": 0.2867187559604645,
"reward_std": 0.3396008610725403,
"rewards/format_reward/mean": 0.5234375,
"rewards/format_reward/std": 0.2735668122768402,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1964.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 399.25,
"completions/mean_terminated_length": 399.25,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.02625,
"frac_reward_zero_std": 0.0,
"grad_norm": 11.555954933166504,
"learning_rate": 9.931806517013612e-07,
"loss": 0.0,
"num_tokens": 2465620.0,
"reward": 0.3179687559604645,
"reward_std": 0.44024717807769775,
"rewards/format_reward/mean": 0.5234375,
"rewards/format_reward/std": 0.2735668122768402,
"rewards/mcq_exact_match_reward/mean": 0.265625,
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1505.0,
"completions/max_terminated_length": 1505.0,
"completions/mean_length": 327.234375,
"completions/mean_terminated_length": 327.234375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0275,
"frac_reward_zero_std": 0.125,
"grad_norm": 4.289510726928711,
"learning_rate": 9.917525374361911e-07,
"loss": -0.0,
"num_tokens": 2565427.0,
"reward": 0.28984373807907104,
"reward_std": 0.3399752378463745,
"rewards/format_reward/mean": 0.5546875,
"rewards/format_reward/std": 0.2538151443004608,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 940.0,
"completions/max_terminated_length": 940.0,
"completions/mean_length": 297.296875,
"completions/mean_terminated_length": 297.296875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.02875,
"frac_reward_zero_std": 0.125,
"grad_norm": 13.854007720947266,
"learning_rate": 9.901899829374047e-07,
"loss": 0.0,
"num_tokens": 2667206.0,
"reward": 0.32890623807907104,
"reward_std": 0.2701229453086853,
"rewards/format_reward/mean": 0.4765625,
"rewards/format_reward/std": 0.20758795738220215,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1274.0,
"completions/max_terminated_length": 1274.0,
"completions/mean_length": 195.265625,
"completions/mean_terminated_length": 195.265625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03,
"frac_reward_zero_std": 0.25,
"grad_norm": 16.784299850463867,
"learning_rate": 9.884934153917996e-07,
"loss": 0.0,
"num_tokens": 2755663.0,
"reward": 0.31640625,
"reward_std": 0.1695163995027542,
"rewards/format_reward/mean": 0.5078125,
"rewards/format_reward/std": 0.20877929031848907,
"rewards/mcq_exact_match_reward/mean": 0.265625,
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1681.0,
"completions/max_terminated_length": 1681.0,
"completions/mean_length": 253.875,
"completions/mean_terminated_length": 253.875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03125,
"frac_reward_zero_std": 0.25,
"grad_norm": 25.41985511779785,
"learning_rate": 9.866632986240029e-07,
"loss": -0.0,
"num_tokens": 2837511.0,
"reward": 0.2749999761581421,
"reward_std": 0.20281967520713806,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.208927720785141,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1067.0,
"completions/max_terminated_length": 1067.0,
"completions/mean_length": 199.59375,
"completions/mean_terminated_length": 199.59375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0325,
"frac_reward_zero_std": 0.25,
"grad_norm": 8.237688064575195,
"learning_rate": 9.847001329696652e-07,
"loss": 0.0,
"num_tokens": 2942205.0,
"reward": 0.38984376192092896,
"reward_std": 0.24267949163913727,
"rewards/format_reward/mean": 0.6171875,
"rewards/format_reward/std": 0.21347814798355103,
"rewards/mcq_exact_match_reward/mean": 0.328125,
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 92.25,
"completions/mean_terminated_length": 92.25,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03375,
"frac_reward_zero_std": 0.5,
"grad_norm": 15.814329147338867,
"learning_rate": 9.826044551386742e-07,
"loss": 0.0,
"num_tokens": 3037685.0,
"reward": 0.526562511920929,
"reward_std": 0.17365704476833344,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.23935678601264954,
"rewards/mcq_exact_match_reward/mean": 0.46875,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 677.0,
"completions/max_terminated_length": 677.0,
"completions/mean_length": 136.734375,
"completions/mean_terminated_length": 136.734375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.035,
"frac_reward_zero_std": 0.375,
"grad_norm": 16.19939613342285,
"learning_rate": 9.803768380684242e-07,
"loss": -0.0,
"num_tokens": 3138908.0,
"reward": 0.4531249701976776,
"reward_std": 0.23708730936050415,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.2182178944349289,
"rewards/mcq_exact_match_reward/mean": 0.390625,
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 543.0,
"completions/max_terminated_length": 543.0,
"completions/mean_length": 85.0625,
"completions/mean_terminated_length": 85.0625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03625,
"frac_reward_zero_std": 0.125,
"grad_norm": 20.77176284790039,
"learning_rate": 9.780178907671788e-07,
"loss": 0.0,
"num_tokens": 3233712.0,
"reward": 0.3414062261581421,
"reward_std": 0.28602826595306396,
"rewards/format_reward/mean": 0.6015625,
"rewards/format_reward/std": 0.2387082874774933,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 79.84375,
"completions/mean_terminated_length": 79.84375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0375,
"frac_reward_zero_std": 0.5,
"grad_norm": 12.60765266418457,
"learning_rate": 9.755282581475767e-07,
"loss": 0.0,
"num_tokens": 3333926.0,
"reward": 0.16249999403953552,
"reward_std": 0.20131680369377136,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.1510545015335083,
"rewards/mcq_exact_match_reward/mean": 0.109375,
"rewards/mcq_exact_match_reward/std": 0.3145764470100403,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 555.0,
"completions/max_terminated_length": 555.0,
"completions/mean_length": 75.1875,
"completions/mean_terminated_length": 75.1875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03875,
"frac_reward_zero_std": 0.375,
"grad_norm": 15.49142074584961,
"learning_rate": 9.729086208503173e-07,
"loss": -0.0,
"num_tokens": 3423410.0,
"reward": 0.6078124642372131,
"reward_std": 0.270576536655426,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.2083333432674408,
"rewards/mcq_exact_match_reward/mean": 0.546875,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 817.0,
"completions/max_terminated_length": 817.0,
"completions/mean_length": 53.890625,
"completions/mean_terminated_length": 53.890625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.375,
"grad_norm": 22.353435516357422,
"learning_rate": 9.701596950580807e-07,
"loss": -0.0,
"num_tokens": 3516563.0,
"reward": 0.31953126192092896,
"reward_std": 0.2242286503314972,
"rewards/format_reward/mean": 0.6953125,
"rewards/format_reward/std": 0.24587368965148926,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 172.0,
"completions/max_terminated_length": 172.0,
"completions/mean_length": 20.453125,
"completions/mean_terminated_length": 20.453125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.04125,
"frac_reward_zero_std": 0.5,
"grad_norm": 12.851003646850586,
"learning_rate": 9.672822322997304e-07,
"loss": 0.0,
"num_tokens": 3591016.0,
"reward": 0.668749988079071,
"reward_std": 0.2041158676147461,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.2519763112068176,
"rewards/mcq_exact_match_reward/mean": 0.59375,
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.0,
"completions/max_terminated_length": 405.0,
"completions/mean_length": 58.453125,
"completions/mean_terminated_length": 58.453125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0425,
"frac_reward_zero_std": 0.125,
"grad_norm": 15.381024360656738,
"learning_rate": 9.642770192448535e-07,
"loss": 0.0,
"num_tokens": 3684013.0,
"reward": 0.5406250357627869,
"reward_std": 0.3712288737297058,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.25,
"rewards/mcq_exact_match_reward/mean": 0.46875,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 620.0,
"completions/max_terminated_length": 620.0,
"completions/mean_length": 61.640625,
"completions/mean_terminated_length": 61.640625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.04375,
"frac_reward_zero_std": 0.25,
"grad_norm": 16.52124786376953,
"learning_rate": 9.611448774886923e-07,
"loss": -0.0,
"num_tokens": 3761286.0,
"reward": 0.33124998211860657,
"reward_std": 0.25726157426834106,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.12198751419782639,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 844.0,
"completions/mean_length": 94.890625,
"completions/mean_terminated_length": 63.888893127441406,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.045,
"frac_reward_zero_std": 0.5,
"grad_norm": 19.276613235473633,
"learning_rate": 9.578866633275286e-07,
"loss": 0.0,
"num_tokens": 3846599.0,
"reward": 0.31640625,
"reward_std": 0.2186937928199768,
"rewards/format_reward/mean": 0.8203125,
"rewards/format_reward/std": 0.2576941251754761,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 362.0,
"completions/max_terminated_length": 362.0,
"completions/mean_length": 44.015625,
"completions/mean_terminated_length": 44.015625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.04625,
"frac_reward_zero_std": 0.375,
"grad_norm": 28.070011138916016,
"learning_rate": 9.545032675245813e-07,
"loss": 0.0,
"num_tokens": 3917384.0,
"reward": 0.542187511920929,
"reward_std": 0.21420830488204956,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.2083333432674408,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 167.0,
"completions/mean_length": 70.375,
"completions/mean_terminated_length": 38.984130859375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0475,
"frac_reward_zero_std": 0.5,
"grad_norm": 24.695430755615234,
"learning_rate": 9.509956150664795e-07,
"loss": -0.0,
"num_tokens": 3998968.0,
"reward": 0.45078128576278687,
"reward_std": 0.18445391952991486,
"rewards/format_reward/mean": 0.9140625,
"rewards/format_reward/std": 0.209963858127594,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 559.0,
"completions/max_terminated_length": 559.0,
"completions/mean_length": 44.234375,
"completions/mean_terminated_length": 44.234375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.04875,
"frac_reward_zero_std": 0.625,
"grad_norm": 9.522443771362305,
"learning_rate": 9.473646649103817e-07,
"loss": -0.0,
"num_tokens": 4083311.0,
"reward": 0.2914062738418579,
"reward_std": 0.18989473581314087,
"rewards/format_reward/mean": 0.8828125,
"rewards/format_reward/std": 0.21347814798355103,
"rewards/mcq_exact_match_reward/mean": 0.203125,
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 225.0,
"completions/max_terminated_length": 225.0,
"completions/mean_length": 36.28125,
"completions/mean_terminated_length": 36.28125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05,
"frac_reward_zero_std": 0.375,
"grad_norm": 36.65068054199219,
"learning_rate": 9.436114097218058e-07,
"loss": 0.0,
"num_tokens": 4155497.0,
"reward": 0.43906253576278687,
"reward_std": 0.20156370103359222,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.14689241349697113,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 297.0,
"completions/max_terminated_length": 297.0,
"completions/mean_length": 35.578125,
"completions/mean_terminated_length": 35.578125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05125,
"frac_reward_zero_std": 0.25,
"grad_norm": 24.931636810302734,
"learning_rate": 9.397368756032444e-07,
"loss": -0.0,
"num_tokens": 4249622.0,
"reward": 0.5,
"reward_std": 0.23814013600349426,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.1666666716337204,
"rewards/mcq_exact_match_reward/mean": 0.40625,
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 285.0,
"completions/max_terminated_length": 285.0,
"completions/mean_length": 29.625,
"completions/mean_terminated_length": 29.625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0525,
"frac_reward_zero_std": 0.625,
"grad_norm": 15.338611602783203,
"learning_rate": 9.357421218136386e-07,
"loss": 0.0,
"num_tokens": 4333710.0,
"reward": 0.3531250059604645,
"reward_std": 0.1804211586713791,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.2182178944349289,
"rewards/mcq_exact_match_reward/mean": 0.265625,
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 710.0,
"completions/max_terminated_length": 710.0,
"completions/mean_length": 72.84375,
"completions/mean_terminated_length": 72.84375,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.05375,
"frac_reward_zero_std": 0.5,
"grad_norm": 10.507906913757324,
"learning_rate": 9.316282404787869e-07,
"loss": 0.0,
"num_tokens": 4440996.0,
"reward": 0.4296875,
"reward_std": 0.21748682856559753,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 101.0,
"completions/max_terminated_length": 101.0,
"completions/mean_length": 17.46875,
"completions/mean_terminated_length": 17.46875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.055,
"frac_reward_zero_std": 0.375,
"grad_norm": 16.530738830566406,
"learning_rate": 9.273963562927694e-07,
"loss": -0.0,
"num_tokens": 4527058.0,
"reward": 0.5625,
"reward_std": 0.2709311842918396,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.1666666716337204,
"rewards/mcq_exact_match_reward/mean": 0.46875,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 274.0,
"completions/max_terminated_length": 274.0,
"completions/mean_length": 38.5625,
"completions/mean_terminated_length": 38.5625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.05625,
"frac_reward_zero_std": 0.5,
"grad_norm": 9.306171417236328,
"learning_rate": 9.230476262104676e-07,
"loss": 0.0,
"num_tokens": 4591334.0,
"reward": 0.6625000238418579,
"reward_std": 0.23356688022613525,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.5625,
"rewards/mcq_exact_match_reward/std": 0.5,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 664.0,
"completions/max_terminated_length": 664.0,
"completions/mean_length": 49.96875,
"completions/mean_terminated_length": 49.96875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0575,
"frac_reward_zero_std": 0.625,
"grad_norm": 8.561665534973145,
"learning_rate": 9.185832391312642e-07,
"loss": 0.0,
"num_tokens": 4671492.0,
"reward": 0.3890625238418579,
"reward_std": 0.1695934236049652,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.18298126757144928,
"rewards/mcq_exact_match_reward/mean": 0.296875,
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1079.0,
"completions/mean_length": 102.84375,
"completions/mean_terminated_length": 71.96826171875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.05875,
"frac_reward_zero_std": 0.625,
"grad_norm": 5.362051486968994,
"learning_rate": 9.1400441557401e-07,
"loss": 0.0,
"num_tokens": 4742906.0,
"reward": 0.612500011920929,
"reward_std": 0.10678248107433319,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.1510545015335083,
"rewards/mcq_exact_match_reward/mean": 0.515625,
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 275.0,
"completions/max_terminated_length": 275.0,
"completions/mean_length": 43.265625,
"completions/mean_terminated_length": 43.265625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.06,
"frac_reward_zero_std": 0.5,
"grad_norm": 9.450302124023438,
"learning_rate": 9.093124073433462e-07,
"loss": -0.0,
"num_tokens": 4808035.0,
"reward": 0.4906250238418579,
"reward_std": 0.2109457552433014,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.390625,
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 264.0,
"completions/max_terminated_length": 264.0,
"completions/mean_length": 27.03125,
"completions/mean_terminated_length": 27.03125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.06125,
"frac_reward_zero_std": 0.75,
"grad_norm": 7.115367889404297,
"learning_rate": 9.045084971874737e-07,
"loss": 0.0,
"num_tokens": 4889933.0,
"reward": 0.7406250238418579,
"reward_std": 0.12255740165710449,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.640625,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.0,
"completions/max_terminated_length": 194.0,
"completions/mean_length": 28.4375,
"completions/mean_terminated_length": 28.4375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0625,
"frac_reward_zero_std": 0.75,
"grad_norm": 15.029036521911621,
"learning_rate": 8.995939984474623e-07,
"loss": 0.0,
"num_tokens": 4971121.0,
"reward": 0.8031250238418579,
"reward_std": 0.10205793380737305,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.703125,
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 771.0,
"completions/max_terminated_length": 771.0,
"completions/mean_length": 80.90625,
"completions/mean_terminated_length": 80.90625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.06375,
"frac_reward_zero_std": 0.375,
"grad_norm": 30.694612503051758,
"learning_rate": 8.945702546981968e-07,
"loss": 0.0,
"num_tokens": 5048643.0,
"reward": 0.6203124523162842,
"reward_std": 0.07164573669433594,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.2592533528804779,
"rewards/mcq_exact_match_reward/mean": 0.53125,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 455.0,
"completions/max_terminated_length": 455.0,
"completions/mean_length": 37.765625,
"completions/mean_terminated_length": 37.765625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.065,
"frac_reward_zero_std": 0.75,
"grad_norm": 12.923490524291992,
"learning_rate": 8.894386393810562e-07,
"loss": 0.0,
"num_tokens": 5134100.0,
"reward": 0.5046875476837158,
"reward_std": 0.11330723762512207,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"rewards/mcq_exact_match_reward/mean": 0.40625,
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 473.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 41.296875,
"completions/mean_terminated_length": 41.296875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.06625,
"frac_reward_zero_std": 0.5,
"grad_norm": 12.523974418640137,
"learning_rate": 8.842005554284295e-07,
"loss": 0.0,
"num_tokens": 5252135.0,
"reward": 0.7250000238418579,
"reward_std": 0.22461533546447754,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.625,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 441.0,
"completions/max_terminated_length": 441.0,
"completions/mean_length": 74.453125,
"completions/mean_terminated_length": 74.453125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0675,
"frac_reward_zero_std": 0.375,
"grad_norm": 15.70406723022461,
"learning_rate": 8.788574348801674e-07,
"loss": 0.0,
"num_tokens": 5334148.0,
"reward": 0.659375011920929,
"reward_std": 0.25896912813186646,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"rewards/mcq_exact_match_reward/mean": 0.5625,
"rewards/mcq_exact_match_reward/std": 0.5,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 671.0,
"completions/max_terminated_length": 671.0,
"completions/mean_length": 46.109375,
"completions/mean_terminated_length": 46.109375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.06875,
"frac_reward_zero_std": 0.625,
"grad_norm": 23.37506103515625,
"learning_rate": 8.734107384920769e-07,
"loss": 0.0,
"num_tokens": 5411715.0,
"reward": 0.5531250238418579,
"reward_std": 0.17358146607875824,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 628.0,
"completions/max_terminated_length": 628.0,
"completions/mean_length": 40.359375,
"completions/mean_terminated_length": 40.359375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.07,
"frac_reward_zero_std": 0.75,
"grad_norm": 16.58426856994629,
"learning_rate": 8.678619553365658e-07,
"loss": -0.0,
"num_tokens": 5495986.0,
"reward": 0.6312500238418579,
"reward_std": 0.0883883461356163,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.53125,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 333.0,
"completions/max_terminated_length": 333.0,
"completions/mean_length": 49.171875,
"completions/mean_terminated_length": 49.171875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.07125,
"frac_reward_zero_std": 0.875,
"grad_norm": 9.968092918395996,
"learning_rate": 8.622126023955445e-07,
"loss": 0.0,
"num_tokens": 5589749.0,
"reward": 0.6765625476837158,
"reward_std": 0.06687791645526886,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.08768405020236969,
"rewards/mcq_exact_match_reward/mean": 0.578125,
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 330.0,
"completions/max_terminated_length": 330.0,
"completions/mean_length": 45.828125,
"completions/mean_terminated_length": 45.828125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0725,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.7135562896728516,
"learning_rate": 8.564642241456986e-07,
"loss": -0.0,
"num_tokens": 5676570.0,
"reward": 0.6156250238418579,
"reward_std": 0.04419417306780815,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.515625,
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 458.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 39.21875,
"completions/mean_terminated_length": 39.21875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.07375,
"frac_reward_zero_std": 0.75,
"grad_norm": 17.85677146911621,
"learning_rate": 8.506183921362442e-07,
"loss": 0.0,
"num_tokens": 5756528.0,
"reward": 0.7406250238418579,
"reward_std": 0.10205793380737305,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.640625,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 186.0,
"completions/max_terminated_length": 186.0,
"completions/mean_length": 22.78125,
"completions/mean_terminated_length": 22.78125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.075,
"frac_reward_zero_std": 0.75,
"grad_norm": 33.40819549560547,
"learning_rate": 8.446767045592829e-07,
"loss": 0.0,
"num_tokens": 5829058.0,
"reward": 0.7710937857627869,
"reward_std": 0.09984822571277618,
"rewards/format_reward/mean": 0.9921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.671875,
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 179.0,
"completions/max_terminated_length": 179.0,
"completions/mean_length": 17.703125,
"completions/mean_terminated_length": 17.703125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.07625,
"frac_reward_zero_std": 0.875,
"grad_norm": 6.3193039894104,
"learning_rate": 8.386407858128706e-07,
"loss": 0.0,
"num_tokens": 5903063.0,
"reward": 0.5062500238418579,
"reward_std": 0.0578637570142746,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.40625,
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 226.0,
"completions/max_terminated_length": 226.0,
"completions/mean_length": 33.96875,
"completions/mean_terminated_length": 33.96875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0775,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.325122860569241e-07,
"loss": 0.0,
"num_tokens": 5984173.0,
"reward": 0.9750000238418579,
"reward_std": 0.0,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.875,
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1581.0,
"completions/max_terminated_length": 1581.0,
"completions/mean_length": 96.734375,
"completions/mean_terminated_length": 96.734375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.07875,
"frac_reward_zero_std": 0.5,
"grad_norm": 11.058859825134277,
"learning_rate": 8.262928807620843e-07,
"loss": -0.0,
"num_tokens": 6062100.0,
"reward": 0.6296875476837158,
"reward_std": 0.15962307155132294,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.08768405020236969,
"rewards/mcq_exact_match_reward/mean": 0.53125,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 466.0,
"completions/max_terminated_length": 466.0,
"completions/mean_length": 34.109375,
"completions/mean_terminated_length": 34.109375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.08,
"frac_reward_zero_std": 0.75,
"grad_norm": 6.3704962730407715,
"learning_rate": 8.199842702516582e-07,
"loss": -0.0,
"num_tokens": 6133635.0,
"reward": 0.4750000238418579,
"reward_std": 0.0883883461356163,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.375,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 302.0,
"completions/max_terminated_length": 302.0,
"completions/mean_length": 30.375,
"completions/mean_terminated_length": 30.375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.08125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.135881792367685e-07,
"loss": 0.0,
"num_tokens": 6218427.0,
"reward": 0.6000000238418579,
"reward_std": 0.0,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.5,
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 44.90625,
"completions/mean_terminated_length": 44.90625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0825,
"frac_reward_zero_std": 0.875,
"grad_norm": 6.963454723358154,
"learning_rate": 8.071063563448339e-07,
"loss": -0.0,
"num_tokens": 6304541.0,
"reward": 0.9117187857627869,
"reward_std": 0.06768143177032471,
"rewards/format_reward/mean": 0.9921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.8125,
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 115.0,
"completions/max_terminated_length": 115.0,
"completions/mean_length": 15.109375,
"completions/mean_terminated_length": 15.109375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.08375,
"frac_reward_zero_std": 0.625,
"grad_norm": 10.610746383666992,
"learning_rate": 8.005405736415125e-07,
"loss": -0.0,
"num_tokens": 6382612.0,
"reward": 0.6929687857627869,
"reward_std": 0.09059805423021317,
"rewards/format_reward/mean": 0.9921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.59375,
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 161.0,
"completions/max_terminated_length": 161.0,
"completions/mean_length": 21.765625,
"completions/mean_terminated_length": 21.765625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.085,
"frac_reward_zero_std": 0.875,
"grad_norm": 12.212800979614258,
"learning_rate": 7.938926261462365e-07,
"loss": 0.0,
"num_tokens": 6446757.0,
"reward": 0.7875000238418579,
"reward_std": 0.06681530922651291,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.6875,
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 312.0,
"completions/mean_length": 71.140625,
"completions/mean_terminated_length": 39.761905670166016,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.08625,
"frac_reward_zero_std": 0.75,
"grad_norm": 9.373642921447754,
"learning_rate": 7.871643313414718e-07,
"loss": 0.0,
"num_tokens": 6524198.0,
"reward": 1.0046875476837158,
"reward_std": 0.11330723762512207,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"rewards/mcq_exact_match_reward/mean": 0.90625,
"rewards/mcq_exact_match_reward/std": 0.29378482699394226,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 222.0,
"completions/max_terminated_length": 222.0,
"completions/mean_length": 27.703125,
"completions/mean_terminated_length": 27.703125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0875,
"frac_reward_zero_std": 0.75,
"grad_norm": 14.864502906799316,
"learning_rate": 7.803575286758363e-07,
"loss": 0.0,
"num_tokens": 6599307.0,
"reward": 0.8187500238418579,
"reward_std": 0.0883883461356163,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.71875,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 199.0,
"completions/max_terminated_length": 199.0,
"completions/mean_length": 32.71875,
"completions/mean_terminated_length": 32.71875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.08875,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.8891595602035522,
"learning_rate": 7.734740790612136e-07,
"loss": 0.0,
"num_tokens": 6682921.0,
"reward": 0.7093750238418579,
"reward_std": 0.04419417306780815,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.609375,
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 556.0,
"completions/max_terminated_length": 556.0,
"completions/mean_length": 27.71875,
"completions/mean_terminated_length": 27.71875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.09,
"frac_reward_zero_std": 0.875,
"grad_norm": 3.046689033508301,
"learning_rate": 7.665158643639969e-07,
"loss": -0.0,
"num_tokens": 6767743.0,
"reward": 0.8492187857627869,
"reward_std": 0.00220970856025815,
"rewards/format_reward/mean": 0.9921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.75,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 260.0,
"completions/max_terminated_length": 260.0,
"completions/mean_length": 22.734375,
"completions/mean_terminated_length": 22.734375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.09125,
"frac_reward_zero_std": 0.875,
"grad_norm": 7.001441955566406,
"learning_rate": 7.594847868906076e-07,
"loss": 0.0,
"num_tokens": 6864566.0,
"reward": 0.8968750238418579,
"reward_std": 0.0646936446428299,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.796875,
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.0625,
"completions/mean_terminated_length": 12.0625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0925,
"frac_reward_zero_std": 0.75,
"grad_norm": 32.40105056762695,
"learning_rate": 7.523827688674219e-07,
"loss": 0.0,
"num_tokens": 6921450.0,
"reward": 0.7718750238418579,
"reward_std": 0.11100947856903076,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.671875,
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 179.0,
"completions/max_terminated_length": 179.0,
"completions/mean_length": 14.734375,
"completions/mean_terminated_length": 14.734375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.09375,
"frac_reward_zero_std": 0.625,
"grad_norm": 17.64237403869629,
"learning_rate": 7.452117519152541e-07,
"loss": 0.0,
"num_tokens": 7008801.0,
"reward": 0.7093750238418579,
"reward_std": 0.189372718334198,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.609375,
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 148.0,
"completions/max_terminated_length": 148.0,
"completions/mean_length": 15.921875,
"completions/mean_terminated_length": 15.921875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.095,
"frac_reward_zero_std": 0.625,
"grad_norm": 17.417652130126953,
"learning_rate": 7.379736965185368e-07,
"loss": -0.0,
"num_tokens": 7089380.0,
"reward": 0.4281250238418579,
"reward_std": 0.1530819982290268,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.328125,
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 290.0,
"completions/max_terminated_length": 290.0,
"completions/mean_length": 16.765625,
"completions/mean_terminated_length": 16.765625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.09625,
"frac_reward_zero_std": 0.75,
"grad_norm": 15.111842155456543,
"learning_rate": 7.306705814893439e-07,
"loss": 0.0,
"num_tokens": 7194789.0,
"reward": 0.45781251788139343,
"reward_std": 0.04861358925700188,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.03125,
"completions/mean_terminated_length": 12.03125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0975,
"frac_reward_zero_std": 0.75,
"grad_norm": 30.111845016479492,
"learning_rate": 7.233044034264033e-07,
"loss": 0.0,
"num_tokens": 7284127.0,
"reward": 0.2718750238418579,
"reward_std": 0.10205793380737305,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.171875,
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 14.0,
"completions/max_terminated_length": 14.0,
"completions/mean_length": 12.0625,
"completions/mean_terminated_length": 12.0625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.09875,
"frac_reward_zero_std": 0.875,
"grad_norm": 35.65519714355469,
"learning_rate": 7.158771761692464e-07,
"loss": 0.0,
"num_tokens": 7379515.0,
"reward": 0.8187500238418579,
"reward_std": 0.0578637570142746,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.71875,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 66.0,
"completions/max_terminated_length": 66.0,
"completions/mean_length": 12.90625,
"completions/mean_terminated_length": 12.90625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1,
"frac_reward_zero_std": 0.875,
"grad_norm": 20.51276206970215,
"learning_rate": 7.083909302476452e-07,
"loss": 0.0,
"num_tokens": 7446461.0,
"reward": 0.6781250238418579,
"reward_std": 0.0646936446428299,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.578125,
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.09375,
"completions/mean_terminated_length": 12.09375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.10125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.008477123264847e-07,
"loss": 0.0,
"num_tokens": 7561899.0,
"reward": 0.6000000238418579,
"reward_std": 0.0,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.5,
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 557.0,
"completions/max_terminated_length": 557.0,
"completions/mean_length": 20.515625,
"completions/mean_terminated_length": 20.515625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1025,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.932495846462261e-07,
"loss": 0.0,
"num_tokens": 7644644.0,
"reward": 0.7250000238418579,
"reward_std": 0.0,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.625,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.015625,
"completions/mean_terminated_length": 12.015625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.10375,
"frac_reward_zero_std": 0.875,
"grad_norm": 6.890464782714844,
"learning_rate": 6.855986244591103e-07,
"loss": -0.0,
"num_tokens": 7729413.0,
"reward": 0.6156250238418579,
"reward_std": 0.04419417306780815,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.515625,
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 14.0,
"completions/max_terminated_length": 14.0,
"completions/mean_length": 12.140625,
"completions/mean_terminated_length": 12.140625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.105,
"frac_reward_zero_std": 0.75,
"grad_norm": 19.9100341796875,
"learning_rate": 6.778969234612583e-07,
"loss": -0.0,
"num_tokens": 7815534.0,
"reward": 0.7906250357627869,
"reward_std": 0.10205793380737305,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"rewards/mcq_exact_match_reward/mean": 0.703125,
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.0625,
"completions/mean_terminated_length": 12.0625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.10625,
"frac_reward_zero_std": 0.875,
"grad_norm": 18.460962295532227,
"learning_rate": 6.701465872208216e-07,
"loss": 0.0,
"num_tokens": 7896298.0,
"reward": 0.7875000238418579,
"reward_std": 0.06681530922651291,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.6875,
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.03125,
"completions/mean_terminated_length": 12.03125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1075,
"frac_reward_zero_std": 0.625,
"grad_norm": 30.060848236083984,
"learning_rate": 6.623497346023417e-07,
"loss": 0.0,
"num_tokens": 7974052.0,
"reward": 0.8031250238418579,
"reward_std": 0.1530819982290268,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.703125,
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 101.0,
"completions/max_terminated_length": 101.0,
"completions/mean_length": 13.453125,
"completions/mean_terminated_length": 13.453125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.10875,
"frac_reward_zero_std": 0.875,
"grad_norm": 20.207021713256836,
"learning_rate": 6.545084971874736e-07,
"loss": 0.0,
"num_tokens": 8049993.0,
"reward": 0.7562500238418579,
"reward_std": 0.0578637570142746,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.65625,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 54.0,
"completions/max_terminated_length": 54.0,
"completions/mean_length": 12.65625,
"completions/mean_terminated_length": 12.65625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.11,
"frac_reward_zero_std": 0.875,
"grad_norm": 18.510467529296875,
"learning_rate": 6.466250186922324e-07,
"loss": -0.0,
"num_tokens": 8119723.0,
"reward": 0.6625000238418579,
"reward_std": 0.06681530922651291,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.5625,
"rewards/mcq_exact_match_reward/std": 0.5,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.046875,
"completions/mean_terminated_length": 12.046875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.11125,
"frac_reward_zero_std": 0.75,
"grad_norm": 26.590150833129883,
"learning_rate": 6.387014543809223e-07,
"loss": 0.0,
"num_tokens": 8201878.0,
"reward": 0.7250000238418579,
"reward_std": 0.13363061845302582,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.625,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 444.0,
"completions/max_terminated_length": 444.0,
"completions/mean_length": 22.1875,
"completions/mean_terminated_length": 22.1875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1125,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.3266140222549438,
"learning_rate": 6.307399704769098e-07,
"loss": -0.0,
"num_tokens": 8289162.0,
"reward": 0.8492187261581421,
"reward_std": 0.00220970856025815,
"rewards/format_reward/mean": 0.9921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.75,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.03125,
"completions/mean_terminated_length": 12.03125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.11375,
"frac_reward_zero_std": 0.625,
"grad_norm": 27.705215454101562,
"learning_rate": 6.227427435703995e-07,
"loss": -0.0,
"num_tokens": 8378164.0,
"reward": 0.4828125238418579,
"reward_std": 0.12902677059173584,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"rewards/mcq_exact_match_reward/mean": 0.390625,
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.0625,
"completions/mean_terminated_length": 12.0625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.115,
"frac_reward_zero_std": 0.75,
"grad_norm": 25.083688735961914,
"learning_rate": 6.147119600233758e-07,
"loss": -0.0,
"num_tokens": 8471760.0,
"reward": 0.9109375476837158,
"reward_std": 0.11871248483657837,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"rewards/mcq_exact_match_reward/mean": 0.8125,
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.015625,
"completions/mean_terminated_length": 12.015625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.11625,
"frac_reward_zero_std": 0.75,
"grad_norm": 22.57097053527832,
"learning_rate": 6.066498153718734e-07,
"loss": 0.0,
"num_tokens": 8532233.0,
"reward": 0.6781250238418579,
"reward_std": 0.10205793380737305,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.578125,
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1175,
"frac_reward_zero_std": 0.625,
"grad_norm": 21.33436393737793,
"learning_rate": 5.985585137257401e-07,
"loss": 0.0,
"num_tokens": 8625297.0,
"reward": 0.6156250238418579,
"reward_std": 0.189372718334198,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.515625,
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.11875,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 5.90440267166055e-07,
"loss": 0.0,
"num_tokens": 8693017.0,
"reward": 0.9750000238418579,
"reward_std": 0.0,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.875,
"rewards/mcq_exact_match_reward/std": 0.3333333432674408,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.12,
"frac_reward_zero_std": 0.875,
"grad_norm": 17.82723617553711,
"learning_rate": 5.82297295140367e-07,
"loss": 0.0,
"num_tokens": 8762857.0,
"reward": 0.5062500238418579,
"reward_std": 0.0578637570142746,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.40625,
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.12125,
"frac_reward_zero_std": 0.875,
"grad_norm": 12.498894691467285,
"learning_rate": 5.741318238559209e-07,
"loss": 0.0,
"num_tokens": 8837825.0,
"reward": 0.7406250238418579,
"reward_std": 0.04419417306780815,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.640625,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 14.0,
"completions/max_terminated_length": 14.0,
"completions/mean_length": 12.0625,
"completions/mean_terminated_length": 12.0625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1225,
"frac_reward_zero_std": 0.875,
"grad_norm": 7.857953071594238,
"learning_rate": 5.659460856710345e-07,
"loss": 0.0,
"num_tokens": 8930893.0,
"reward": 0.5984375476837158,
"reward_std": 0.0044194171205163,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"rewards/mcq_exact_match_reward/mean": 0.5,
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 12.140625,
"completions/mean_terminated_length": 12.140625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.12375,
"frac_reward_zero_std": 0.75,
"grad_norm": 16.862316131591797,
"learning_rate": 5.577423184847931e-07,
"loss": -0.0,
"num_tokens": 9008550.0,
"reward": 0.5531250238418579,
"reward_std": 0.10205793380737305,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.125,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 5.495227651252315e-07,
"loss": 0.0,
"num_tokens": 9088822.0,
"reward": 0.7250000238418579,
"reward_std": 0.0,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.625,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 9088822,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}