Files
ModelHub XC b27438d51a 初始化项目,由ModelHub XC社区提供模型
Model: cjiao/goldengoose-divsweep_goose_n512_indorc_tau0.10-7grp
Source: Original Platform
2026-06-27 22:07:22 +08:00

1385 lines
50 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11160714285714286,
"eval_steps": 500,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1786.0,
"completions/max_terminated_length": 1786.0,
"completions/mean_length": 541.859375,
"completions/mean_terminated_length": 541.859375,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.002232142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.971773147583008,
"learning_rate": 0.0,
"loss": -0.0,
"num_tokens": 112399.0,
"reward": 0.25312498211860657,
"reward_std": 0.35192999243736267,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.25,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1913.0,
"completions/max_terminated_length": 1913.0,
"completions/mean_length": 460.625,
"completions/mean_terminated_length": 460.625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.004464285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.931671142578125,
"learning_rate": 2e-07,
"loss": -0.0,
"num_tokens": 239831.0,
"reward": 0.33671873807907104,
"reward_std": 0.35391804575920105,
"rewards/format_reward/mean": 0.3984375,
"rewards/format_reward/std": 0.20275264978408813,
"rewards/mcq_exact_match_reward/mean": 0.296875,
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1663.0,
"completions/max_terminated_length": 1663.0,
"completions/mean_length": 578.5625,
"completions/mean_terminated_length": 578.5625,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.006696428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.283196210861206,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 357427.0,
"reward": 0.28125,
"reward_std": 0.3595561385154724,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.279951810836792,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1725.0,
"completions/mean_length": 508.953125,
"completions/mean_terminated_length": 484.5238342285156,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.008928571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.620399475097656,
"learning_rate": 6e-07,
"loss": 0.0,
"num_tokens": 498696.0,
"reward": 0.38203126192092896,
"reward_std": 0.4592018127441406,
"rewards/format_reward/mean": 0.3828125,
"rewards/format_reward/std": 0.2634054720401764,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1176.0,
"completions/max_terminated_length": 1176.0,
"completions/mean_length": 443.328125,
"completions/mean_terminated_length": 443.328125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.011160714285714286,
"frac_reward_zero_std": 0.125,
"grad_norm": 23.866165161132812,
"learning_rate": 8e-07,
"loss": -0.0,
"num_tokens": 606045.0,
"reward": 0.23671872913837433,
"reward_std": 0.23968853056430817,
"rewards/format_reward/mean": 0.3359375,
"rewards/format_reward/std": 0.2824873626232147,
"rewards/mcq_exact_match_reward/mean": 0.203125,
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1733.0,
"completions/mean_length": 569.5,
"completions/mean_terminated_length": 521.8064575195312,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.013392857142857142,
"frac_reward_zero_std": 0.25,
"grad_norm": 4.394739627838135,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 764053.0,
"reward": 0.3164062201976776,
"reward_std": 0.24395309388637543,
"rewards/format_reward/mean": 0.3515625,
"rewards/format_reward/std": 0.2302463799715042,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1963.0,
"completions/mean_length": 531.71875,
"completions/mean_terminated_length": 507.65081787109375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.015625,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.567261695861816,
"learning_rate": 9.99726628670463e-07,
"loss": -0.0,
"num_tokens": 895587.0,
"reward": 0.2867187261581421,
"reward_std": 0.34152650833129883,
"rewards/format_reward/mean": 0.3671875,
"rewards/format_reward/std": 0.22257846593856812,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1868.0,
"completions/max_terminated_length": 1868.0,
"completions/mean_length": 575.71875,
"completions/mean_terminated_length": 575.71875,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.017857142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.7189435958862305,
"learning_rate": 9.989068136093872e-07,
"loss": 0.0,
"num_tokens": 1015801.0,
"reward": 0.26093748211860657,
"reward_std": 0.2545996308326721,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.20351573824882507,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1845.0,
"completions/mean_length": 584.140625,
"completions/mean_terminated_length": 560.90478515625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.020089285714285716,
"frac_reward_zero_std": 0.125,
"grad_norm": 6.5300421714782715,
"learning_rate": 9.975414512725056e-07,
"loss": -0.0,
"num_tokens": 1153538.0,
"reward": 0.3515625,
"reward_std": 0.40456390380859375,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.24346621334552765,
"rewards/mcq_exact_match_reward/mean": 0.3125,
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1695.0,
"completions/mean_length": 586.046875,
"completions/mean_terminated_length": 562.84130859375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.022321428571428572,
"frac_reward_zero_std": 0.25,
"grad_norm": 4.844789028167725,
"learning_rate": 9.956320346634875e-07,
"loss": -0.0,
"num_tokens": 1279109.0,
"reward": 0.31874996423721313,
"reward_std": 0.30643922090530396,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.2182178944349289,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 403.140625,
"completions/mean_terminated_length": 350.08062744140625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.024553571428571428,
"frac_reward_zero_std": 0.25,
"grad_norm": 25.20620346069336,
"learning_rate": 9.931806517013612e-07,
"loss": -0.0,
"num_tokens": 1407374.0,
"reward": 0.35468748211860657,
"reward_std": 0.24326452612876892,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.2221602201461792,
"rewards/mcq_exact_match_reward/mean": 0.3125,
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1665.0,
"completions/mean_length": 540.59375,
"completions/mean_terminated_length": 491.96771240234375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.026785714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.811991691589355,
"learning_rate": 9.901899829374047e-07,
"loss": -0.0,
"num_tokens": 1536228.0,
"reward": 0.2890624701976776,
"reward_std": 0.38433414697647095,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.24346621334552765,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1231.0,
"completions/mean_length": 437.3125,
"completions/mean_terminated_length": 385.3548278808594,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.029017857142857144,
"frac_reward_zero_std": 0.125,
"grad_norm": 9.110626220703125,
"learning_rate": 9.866632986240029e-07,
"loss": -0.0,
"num_tokens": 1661912.0,
"reward": 0.38593748211860657,
"reward_std": 0.3386583626270294,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.18298126757144928,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1989.0,
"completions/max_terminated_length": 1989.0,
"completions/mean_length": 396.1875,
"completions/mean_terminated_length": 396.1875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 22.122051239013672,
"learning_rate": 9.826044551386742e-07,
"loss": 0.0,
"num_tokens": 1785052.0,
"reward": 0.5171874761581421,
"reward_std": 0.37966397404670715,
"rewards/format_reward/mean": 0.484375,
"rewards/format_reward/std": 0.29504841566085815,
"rewards/mcq_exact_match_reward/mean": 0.46875,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1319.0,
"completions/mean_length": 398.1875,
"completions/mean_terminated_length": 344.9677429199219,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.033482142857142856,
"frac_reward_zero_std": 0.25,
"grad_norm": 8.21301555633545,
"learning_rate": 9.780178907671788e-07,
"loss": -0.0,
"num_tokens": 1906464.0,
"reward": 0.23906248807907104,
"reward_std": 0.214445561170578,
"rewards/format_reward/mean": 0.515625,
"rewards/format_reward/std": 0.1985812783241272,
"rewards/mcq_exact_match_reward/mean": 0.1875,
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1598.0,
"completions/mean_length": 539.109375,
"completions/mean_terminated_length": 411.2372741699219,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.03571428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.956470489501953,
"learning_rate": 9.729086208503173e-07,
"loss": -0.0,
"num_tokens": 2039351.0,
"reward": 0.4976562261581421,
"reward_std": 0.4534168541431427,
"rewards/format_reward/mean": 0.4453125,
"rewards/format_reward/std": 0.26899561285972595,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1142.0,
"completions/mean_length": 333.8125,
"completions/mean_terminated_length": 306.6031799316406,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03794642857142857,
"frac_reward_zero_std": 0.125,
"grad_norm": 19.72380256652832,
"learning_rate": 9.672822322997304e-07,
"loss": -0.0,
"num_tokens": 2139947.0,
"reward": 0.3046875,
"reward_std": 0.3565414547920227,
"rewards/format_reward/mean": 0.546875,
"rewards/format_reward/std": 0.2916666865348816,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1142.0,
"completions/mean_length": 248.0625,
"completions/mean_terminated_length": 219.49208068847656,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.04017857142857143,
"frac_reward_zero_std": 0.375,
"grad_norm": 27.903902053833008,
"learning_rate": 9.611448774886923e-07,
"loss": 0.0,
"num_tokens": 2281071.0,
"reward": 0.18984374403953552,
"reward_std": 0.21500104665756226,
"rewards/format_reward/mean": 0.4921875,
"rewards/format_reward/std": 0.18881812691688538,
"rewards/mcq_exact_match_reward/mean": 0.140625,
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1478.0,
"completions/mean_length": 237.8125,
"completions/mean_terminated_length": 209.07937622070312,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.04241071428571429,
"frac_reward_zero_std": 0.125,
"grad_norm": 23.812612533569336,
"learning_rate": 9.545032675245813e-07,
"loss": -0.0,
"num_tokens": 2410507.0,
"reward": 0.3968749940395355,
"reward_std": 0.37050747871398926,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.21593283116817474,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 102.78125,
"completions/mean_terminated_length": 102.78125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.044642857142857144,
"frac_reward_zero_std": 0.125,
"grad_norm": 17.30659294128418,
"learning_rate": 9.473646649103817e-07,
"loss": -0.0,
"num_tokens": 2492773.0,
"reward": 0.5757812261581421,
"reward_std": 0.4401569366455078,
"rewards/format_reward/mean": 0.6015625,
"rewards/format_reward/std": 0.28423789143562317,
"rewards/mcq_exact_match_reward/mean": 0.515625,
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 685.0,
"completions/mean_length": 71.796875,
"completions/mean_terminated_length": 40.42857360839844,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.046875,
"frac_reward_zero_std": 0.625,
"grad_norm": 10.079203605651855,
"learning_rate": 9.397368756032444e-07,
"loss": 0.0,
"num_tokens": 2624568.0,
"reward": 0.10312500596046448,
"reward_std": 0.10811922699213028,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.22712838649749756,
"rewards/mcq_exact_match_reward/mean": 0.046875,
"rewards/mcq_exact_match_reward/std": 0.21304203569889069,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1416.0,
"completions/max_terminated_length": 1416.0,
"completions/mean_length": 105.59375,
"completions/mean_terminated_length": 105.59375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.049107142857142856,
"frac_reward_zero_std": 0.375,
"grad_norm": 15.665273666381836,
"learning_rate": 9.316282404787869e-07,
"loss": -0.0,
"num_tokens": 2746590.0,
"reward": 0.3062500059604645,
"reward_std": 0.26257947087287903,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.18898223340511322,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1835.0,
"completions/mean_length": 111.84375,
"completions/mean_terminated_length": 81.11111450195312,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05133928571428571,
"frac_reward_zero_std": 0.375,
"grad_norm": 17.90364646911621,
"learning_rate": 9.230476262104676e-07,
"loss": -0.0,
"num_tokens": 2852316.0,
"reward": 0.26093751192092896,
"reward_std": 0.11917313188314438,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.2221602201461792,
"rewards/mcq_exact_match_reward/mean": 0.203125,
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 721.0,
"completions/max_terminated_length": 721.0,
"completions/mean_length": 59.4375,
"completions/mean_terminated_length": 59.4375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05357142857142857,
"frac_reward_zero_std": 0.125,
"grad_norm": 23.127439498901367,
"learning_rate": 9.1400441557401e-07,
"loss": 0.0,
"num_tokens": 2948880.0,
"reward": 0.4039062261581421,
"reward_std": 0.3720070719718933,
"rewards/format_reward/mean": 0.6015625,
"rewards/format_reward/std": 0.28423789143562317,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 473.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 38.484375,
"completions/mean_terminated_length": 38.484375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05580357142857143,
"frac_reward_zero_std": 0.125,
"grad_norm": 23.131183624267578,
"learning_rate": 9.045084971874737e-07,
"loss": -0.0,
"num_tokens": 3042983.0,
"reward": 0.49609375,
"reward_std": 0.39612793922424316,
"rewards/format_reward/mean": 0.5859375,
"rewards/format_reward/std": 0.244862899184227,
"rewards/mcq_exact_match_reward/mean": 0.4375,
"rewards/mcq_exact_match_reward/std": 0.5,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 396.0,
"completions/max_terminated_length": 396.0,
"completions/mean_length": 14.578125,
"completions/mean_terminated_length": 14.578125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05803571428571429,
"frac_reward_zero_std": 0.375,
"grad_norm": 21.188722610473633,
"learning_rate": 8.945702546981968e-07,
"loss": -0.0,
"num_tokens": 3153444.0,
"reward": 0.26874998211860657,
"reward_std": 0.29173365235328674,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.08908708393573761,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 8.234375,
"completions/mean_terminated_length": 8.234375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.060267857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 31.691007614135742,
"learning_rate": 8.842005554284295e-07,
"loss": 0.0,
"num_tokens": 3245059.0,
"reward": 0.3148437738418579,
"reward_std": 0.4270421266555786,
"rewards/format_reward/mean": 0.6484375,
"rewards/format_reward/std": 0.2302463799715042,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 910.0,
"completions/max_terminated_length": 910.0,
"completions/mean_length": 23.265625,
"completions/mean_terminated_length": 23.265625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0625,
"frac_reward_zero_std": 0.375,
"grad_norm": 24.359390258789062,
"learning_rate": 8.734107384920769e-07,
"loss": 0.0,
"num_tokens": 3361140.0,
"reward": 0.40156251192092896,
"reward_std": 0.28653684258461,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.18298126757144928,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 237.0,
"completions/max_terminated_length": 237.0,
"completions/mean_length": 9.609375,
"completions/mean_terminated_length": 9.609375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.06473214285714286,
"frac_reward_zero_std": 0.625,
"grad_norm": 17.99944305419922,
"learning_rate": 8.622126023955445e-07,
"loss": 0.0,
"num_tokens": 3490331.0,
"reward": 0.20546874403953552,
"reward_std": 0.17656923830509186,
"rewards/format_reward/mean": 0.4921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.15625,
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 6.625,
"completions/mean_terminated_length": 6.625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.06696428571428571,
"frac_reward_zero_std": 0.75,
"grad_norm": 20.938568115234375,
"learning_rate": 8.506183921362442e-07,
"loss": 0.0,
"num_tokens": 3605843.0,
"reward": 0.2874999940395355,
"reward_std": 0.11888101696968079,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.12198751419782639,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 33.0,
"completions/max_terminated_length": 33.0,
"completions/mean_length": 6.421875,
"completions/mean_terminated_length": 6.421875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.06919642857142858,
"frac_reward_zero_std": 0.5,
"grad_norm": 20.33233070373535,
"learning_rate": 8.386407858128706e-07,
"loss": -0.0,
"num_tokens": 3710350.0,
"reward": 0.44062498211860657,
"reward_std": 0.210945725440979,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.390625,
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 6.109375,
"completions/mean_terminated_length": 6.109375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.07142857142857142,
"frac_reward_zero_std": 0.5,
"grad_norm": 26.781448364257812,
"learning_rate": 8.262928807620843e-07,
"loss": 0.0,
"num_tokens": 3807701.0,
"reward": 0.2984374761581421,
"reward_std": 0.16964475810527802,
"rewards/format_reward/mean": 0.484375,
"rewards/format_reward/std": 0.08768405020236969,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 6.84375,
"completions/mean_terminated_length": 6.84375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.07366071428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 22.09980583190918,
"learning_rate": 8.135881792367685e-07,
"loss": 0.0,
"num_tokens": 3908627.0,
"reward": 0.39921873807907104,
"reward_std": 0.1689612716436386,
"rewards/format_reward/mean": 0.5546875,
"rewards/format_reward/std": 0.180765300989151,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 568.0,
"completions/max_terminated_length": 568.0,
"completions/mean_length": 19.828125,
"completions/mean_terminated_length": 19.828125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.07589285714285714,
"frac_reward_zero_std": 0.25,
"grad_norm": 24.214155197143555,
"learning_rate": 8.005405736415125e-07,
"loss": 0.0,
"num_tokens": 3997160.0,
"reward": 0.3937499523162842,
"reward_std": 0.3061639666557312,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 476.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 16.640625,
"completions/mean_terminated_length": 16.640625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.078125,
"frac_reward_zero_std": 0.25,
"grad_norm": 19.65945053100586,
"learning_rate": 7.871643313414718e-07,
"loss": -0.0,
"num_tokens": 4102257.0,
"reward": 0.33124998211860657,
"reward_std": 0.3424546718597412,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 158.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 9.828125,
"completions/mean_terminated_length": 9.828125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08035714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 16.9875545501709,
"learning_rate": 7.734740790612136e-07,
"loss": 0.0,
"num_tokens": 4218214.0,
"reward": 0.40625,
"reward_std": 0.22461532056331635,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.2182178944349289,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 148.0,
"completions/max_terminated_length": 148.0,
"completions/mean_length": 16.703125,
"completions/mean_terminated_length": 16.703125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08258928571428571,
"frac_reward_zero_std": 0.625,
"grad_norm": 15.114908218383789,
"learning_rate": 7.594847868906076e-07,
"loss": -0.0,
"num_tokens": 4304195.0,
"reward": 0.5093749761581421,
"reward_std": 0.1893727034330368,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.1666666716337204,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 16.0,
"completions/max_terminated_length": 16.0,
"completions/mean_length": 6.8125,
"completions/mean_terminated_length": 6.8125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08482142857142858,
"frac_reward_zero_std": 0.375,
"grad_norm": 16.950824737548828,
"learning_rate": 7.452117519152541e-07,
"loss": -0.0,
"num_tokens": 4430631.0,
"reward": 0.5249999761581421,
"reward_std": 0.29143062233924866,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.1666666716337204,
"rewards/mcq_exact_match_reward/mean": 0.46875,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 221.0,
"completions/max_terminated_length": 221.0,
"completions/mean_length": 10.125,
"completions/mean_terminated_length": 10.125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08705357142857142,
"frac_reward_zero_std": 0.5,
"grad_norm": 25.844154357910156,
"learning_rate": 7.306705814893439e-07,
"loss": -0.0,
"num_tokens": 4522799.0,
"reward": 0.4312499761581421,
"reward_std": 0.2177756428718567,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.1666666716337204,
"rewards/mcq_exact_match_reward/mean": 0.375,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 60.0,
"completions/max_terminated_length": 60.0,
"completions/mean_length": 8.203125,
"completions/mean_terminated_length": 8.203125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08928571428571429,
"frac_reward_zero_std": 0.625,
"grad_norm": 31.361331939697266,
"learning_rate": 7.158771761692464e-07,
"loss": 0.0,
"num_tokens": 4608428.0,
"reward": 0.6703124642372131,
"reward_std": 0.049638569355010986,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.2083333432674408,
"rewards/mcq_exact_match_reward/mean": 0.609375,
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 8.90625,
"completions/mean_terminated_length": 8.90625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.09151785714285714,
"frac_reward_zero_std": 0.375,
"grad_norm": 17.690139770507812,
"learning_rate": 7.008477123264847e-07,
"loss": 0.0,
"num_tokens": 4691318.0,
"reward": 0.5726562738418579,
"reward_std": 0.2708982527256012,
"rewards/format_reward/mean": 0.7265625,
"rewards/format_reward/std": 0.250866562128067,
"rewards/mcq_exact_match_reward/mean": 0.5,
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 6.09375,
"completions/mean_terminated_length": 6.09375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.09375,
"frac_reward_zero_std": 0.5,
"grad_norm": 18.50206756591797,
"learning_rate": 6.855986244591103e-07,
"loss": -0.0,
"num_tokens": 4789988.0,
"reward": 0.4257812201976776,
"reward_std": 0.2243541181087494,
"rewards/format_reward/mean": 0.5078125,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.375,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 359.0,
"completions/max_terminated_length": 359.0,
"completions/mean_length": 16.03125,
"completions/mean_terminated_length": 16.03125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.09598214285714286,
"frac_reward_zero_std": 0.25,
"grad_norm": 54.797157287597656,
"learning_rate": 6.701465872208216e-07,
"loss": -0.0,
"num_tokens": 4877334.0,
"reward": 0.8117187023162842,
"reward_std": 0.18093490600585938,
"rewards/format_reward/mean": 0.6171875,
"rewards/format_reward/std": 0.21347814798355103,
"rewards/mcq_exact_match_reward/mean": 0.75,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 7.515625,
"completions/mean_terminated_length": 7.515625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.09821428571428571,
"frac_reward_zero_std": 0.625,
"grad_norm": 12.843533515930176,
"learning_rate": 6.545084971874736e-07,
"loss": 0.0,
"num_tokens": 4983247.0,
"reward": 0.59375,
"reward_std": 0.16675157845020294,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.2182178944349289,
"rewards/mcq_exact_match_reward/mean": 0.53125,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 8.375,
"completions/mean_terminated_length": 8.375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.10044642857142858,
"frac_reward_zero_std": 0.375,
"grad_norm": 21.998193740844727,
"learning_rate": 6.387014543809223e-07,
"loss": -0.0,
"num_tokens": 5059511.0,
"reward": 0.49140626192092896,
"reward_std": 0.20160752534866333,
"rewards/format_reward/mean": 0.6953125,
"rewards/format_reward/std": 0.24587368965148926,
"rewards/mcq_exact_match_reward/mean": 0.421875,
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 7.546875,
"completions/mean_terminated_length": 7.546875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.10267857142857142,
"frac_reward_zero_std": 0.375,
"grad_norm": 28.897733688354492,
"learning_rate": 6.227427435703995e-07,
"loss": -0.0,
"num_tokens": 5150106.0,
"reward": 0.28125,
"reward_std": 0.28247907757759094,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.2182178944349289,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 6.109375,
"completions/mean_terminated_length": 6.109375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.10491071428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 23.489076614379883,
"learning_rate": 6.066498153718734e-07,
"loss": -0.0,
"num_tokens": 5243569.0,
"reward": 0.5351561903953552,
"reward_std": 0.18003448843955994,
"rewards/format_reward/mean": 0.5078125,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.484375,
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 16.0,
"completions/max_terminated_length": 16.0,
"completions/mean_length": 9.90625,
"completions/mean_terminated_length": 9.90625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.10714285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 7.710181713104248,
"learning_rate": 5.90440267166055e-07,
"loss": 0.0,
"num_tokens": 5309691.0,
"reward": 0.17500001192092896,
"reward_std": 0.10888782143592834,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.24397502839565277,
"rewards/mcq_exact_match_reward/mean": 0.09375,
"rewards/mcq_exact_match_reward/std": 0.29378482699394226,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 43.0,
"completions/max_terminated_length": 43.0,
"completions/mean_length": 8.8125,
"completions/mean_terminated_length": 8.8125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.109375,
"frac_reward_zero_std": 0.25,
"grad_norm": 21.57369613647461,
"learning_rate": 5.741318238559209e-07,
"loss": 0.0,
"num_tokens": 5411943.0,
"reward": 0.5812499523162842,
"reward_std": 0.2802783250808716,
"rewards/format_reward/mean": 0.65625,
"rewards/format_reward/std": 0.25,
"rewards/mcq_exact_match_reward/mean": 0.515625,
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 572.0,
"completions/max_terminated_length": 572.0,
"completions/mean_length": 20.78125,
"completions/mean_terminated_length": 20.78125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.11160714285714286,
"frac_reward_zero_std": 0.125,
"grad_norm": 31.959693908691406,
"learning_rate": 5.577423184847931e-07,
"loss": 0.0,
"num_tokens": 5512089.0,
"reward": 0.40937498211860657,
"reward_std": 0.20373183488845825,
"rewards/format_reward/mean": 0.65625,
"rewards/format_reward/std": 0.233588308095932,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 5512089,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}