Model: cjiao/goldengoose-divsweep_goose_n512_indorc_tau0.10-7grp Source: Original Platform
1385 lines
50 KiB
JSON
1385 lines
50 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.11160714285714286,
|
|
"eval_steps": 500,
|
|
"global_step": 50,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1786.0,
|
|
"completions/max_terminated_length": 1786.0,
|
|
"completions/mean_length": 541.859375,
|
|
"completions/mean_terminated_length": 541.859375,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.002232142857142857,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.971773147583008,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.0,
|
|
"num_tokens": 112399.0,
|
|
"reward": 0.25312498211860657,
|
|
"reward_std": 0.35192999243736267,
|
|
"rewards/format_reward/mean": 0.34375,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1913.0,
|
|
"completions/max_terminated_length": 1913.0,
|
|
"completions/mean_length": 460.625,
|
|
"completions/mean_terminated_length": 460.625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.004464285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.931671142578125,
|
|
"learning_rate": 2e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 239831.0,
|
|
"reward": 0.33671873807907104,
|
|
"reward_std": 0.35391804575920105,
|
|
"rewards/format_reward/mean": 0.3984375,
|
|
"rewards/format_reward/std": 0.20275264978408813,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1663.0,
|
|
"completions/max_terminated_length": 1663.0,
|
|
"completions/mean_length": 578.5625,
|
|
"completions/mean_terminated_length": 578.5625,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.006696428571428571,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 2.283196210861206,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 357427.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.3595561385154724,
|
|
"rewards/format_reward/mean": 0.46875,
|
|
"rewards/format_reward/std": 0.279951810836792,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1725.0,
|
|
"completions/mean_length": 508.953125,
|
|
"completions/mean_terminated_length": 484.5238342285156,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.008928571428571428,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 14.620399475097656,
|
|
"learning_rate": 6e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 498696.0,
|
|
"reward": 0.38203126192092896,
|
|
"reward_std": 0.4592018127441406,
|
|
"rewards/format_reward/mean": 0.3828125,
|
|
"rewards/format_reward/std": 0.2634054720401764,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1176.0,
|
|
"completions/max_terminated_length": 1176.0,
|
|
"completions/mean_length": 443.328125,
|
|
"completions/mean_terminated_length": 443.328125,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.011160714285714286,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 23.866165161132812,
|
|
"learning_rate": 8e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 606045.0,
|
|
"reward": 0.23671872913837433,
|
|
"reward_std": 0.23968853056430817,
|
|
"rewards/format_reward/mean": 0.3359375,
|
|
"rewards/format_reward/std": 0.2824873626232147,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1733.0,
|
|
"completions/mean_length": 569.5,
|
|
"completions/mean_terminated_length": 521.8064575195312,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.013392857142857142,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 4.394739627838135,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 764053.0,
|
|
"reward": 0.3164062201976776,
|
|
"reward_std": 0.24395309388637543,
|
|
"rewards/format_reward/mean": 0.3515625,
|
|
"rewards/format_reward/std": 0.2302463799715042,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1963.0,
|
|
"completions/mean_length": 531.71875,
|
|
"completions/mean_terminated_length": 507.65081787109375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.567261695861816,
|
|
"learning_rate": 9.99726628670463e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 895587.0,
|
|
"reward": 0.2867187261581421,
|
|
"reward_std": 0.34152650833129883,
|
|
"rewards/format_reward/mean": 0.3671875,
|
|
"rewards/format_reward/std": 0.22257846593856812,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1868.0,
|
|
"completions/max_terminated_length": 1868.0,
|
|
"completions/mean_length": 575.71875,
|
|
"completions/mean_terminated_length": 575.71875,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.017857142857142856,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 5.7189435958862305,
|
|
"learning_rate": 9.989068136093872e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1015801.0,
|
|
"reward": 0.26093748211860657,
|
|
"reward_std": 0.2545996308326721,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.20351573824882507,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1845.0,
|
|
"completions/mean_length": 584.140625,
|
|
"completions/mean_terminated_length": 560.90478515625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.020089285714285716,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 6.5300421714782715,
|
|
"learning_rate": 9.975414512725056e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1153538.0,
|
|
"reward": 0.3515625,
|
|
"reward_std": 0.40456390380859375,
|
|
"rewards/format_reward/mean": 0.390625,
|
|
"rewards/format_reward/std": 0.24346621334552765,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1695.0,
|
|
"completions/mean_length": 586.046875,
|
|
"completions/mean_terminated_length": 562.84130859375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.022321428571428572,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 4.844789028167725,
|
|
"learning_rate": 9.956320346634875e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1279109.0,
|
|
"reward": 0.31874996423721313,
|
|
"reward_std": 0.30643922090530396,
|
|
"rewards/format_reward/mean": 0.375,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2034.0,
|
|
"completions/mean_length": 403.140625,
|
|
"completions/mean_terminated_length": 350.08062744140625,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.024553571428571428,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 25.20620346069336,
|
|
"learning_rate": 9.931806517013612e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1407374.0,
|
|
"reward": 0.35468748211860657,
|
|
"reward_std": 0.24326452612876892,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.2221602201461792,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1665.0,
|
|
"completions/mean_length": 540.59375,
|
|
"completions/mean_terminated_length": 491.96771240234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.026785714285714284,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.811991691589355,
|
|
"learning_rate": 9.901899829374047e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1536228.0,
|
|
"reward": 0.2890624701976776,
|
|
"reward_std": 0.38433414697647095,
|
|
"rewards/format_reward/mean": 0.390625,
|
|
"rewards/format_reward/std": 0.24346621334552765,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1231.0,
|
|
"completions/mean_length": 437.3125,
|
|
"completions/mean_terminated_length": 385.3548278808594,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.029017857142857144,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 9.110626220703125,
|
|
"learning_rate": 9.866632986240029e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1661912.0,
|
|
"reward": 0.38593748211860657,
|
|
"reward_std": 0.3386583626270294,
|
|
"rewards/format_reward/mean": 0.421875,
|
|
"rewards/format_reward/std": 0.18298126757144928,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1989.0,
|
|
"completions/max_terminated_length": 1989.0,
|
|
"completions/mean_length": 396.1875,
|
|
"completions/mean_terminated_length": 396.1875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03125,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 22.122051239013672,
|
|
"learning_rate": 9.826044551386742e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1785052.0,
|
|
"reward": 0.5171874761581421,
|
|
"reward_std": 0.37966397404670715,
|
|
"rewards/format_reward/mean": 0.484375,
|
|
"rewards/format_reward/std": 0.29504841566085815,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1319.0,
|
|
"completions/mean_length": 398.1875,
|
|
"completions/mean_terminated_length": 344.9677429199219,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.033482142857142856,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 8.21301555633545,
|
|
"learning_rate": 9.780178907671788e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1906464.0,
|
|
"reward": 0.23906248807907104,
|
|
"reward_std": 0.214445561170578,
|
|
"rewards/format_reward/mean": 0.515625,
|
|
"rewards/format_reward/std": 0.1985812783241272,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.078125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1598.0,
|
|
"completions/mean_length": 539.109375,
|
|
"completions/mean_terminated_length": 411.2372741699219,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.03571428571428571,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 17.956470489501953,
|
|
"learning_rate": 9.729086208503173e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2039351.0,
|
|
"reward": 0.4976562261581421,
|
|
"reward_std": 0.4534168541431427,
|
|
"rewards/format_reward/mean": 0.4453125,
|
|
"rewards/format_reward/std": 0.26899561285972595,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1142.0,
|
|
"completions/mean_length": 333.8125,
|
|
"completions/mean_terminated_length": 306.6031799316406,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03794642857142857,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 19.72380256652832,
|
|
"learning_rate": 9.672822322997304e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2139947.0,
|
|
"reward": 0.3046875,
|
|
"reward_std": 0.3565414547920227,
|
|
"rewards/format_reward/mean": 0.546875,
|
|
"rewards/format_reward/std": 0.2916666865348816,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1142.0,
|
|
"completions/mean_length": 248.0625,
|
|
"completions/mean_terminated_length": 219.49208068847656,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.04017857142857143,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 27.903902053833008,
|
|
"learning_rate": 9.611448774886923e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2281071.0,
|
|
"reward": 0.18984374403953552,
|
|
"reward_std": 0.21500104665756226,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.18881812691688538,
|
|
"rewards/mcq_exact_match_reward/mean": 0.140625,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1478.0,
|
|
"completions/mean_length": 237.8125,
|
|
"completions/mean_terminated_length": 209.07937622070312,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04241071428571429,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 23.812612533569336,
|
|
"learning_rate": 9.545032675245813e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2410507.0,
|
|
"reward": 0.3968749940395355,
|
|
"reward_std": 0.37050747871398926,
|
|
"rewards/format_reward/mean": 0.53125,
|
|
"rewards/format_reward/std": 0.21593283116817474,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 832.0,
|
|
"completions/max_terminated_length": 832.0,
|
|
"completions/mean_length": 102.78125,
|
|
"completions/mean_terminated_length": 102.78125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.044642857142857144,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 17.30659294128418,
|
|
"learning_rate": 9.473646649103817e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2492773.0,
|
|
"reward": 0.5757812261581421,
|
|
"reward_std": 0.4401569366455078,
|
|
"rewards/format_reward/mean": 0.6015625,
|
|
"rewards/format_reward/std": 0.28423789143562317,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 685.0,
|
|
"completions/mean_length": 71.796875,
|
|
"completions/mean_terminated_length": 40.42857360839844,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.046875,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 10.079203605651855,
|
|
"learning_rate": 9.397368756032444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2624568.0,
|
|
"reward": 0.10312500596046448,
|
|
"reward_std": 0.10811922699213028,
|
|
"rewards/format_reward/mean": 0.5625,
|
|
"rewards/format_reward/std": 0.22712838649749756,
|
|
"rewards/mcq_exact_match_reward/mean": 0.046875,
|
|
"rewards/mcq_exact_match_reward/std": 0.21304203569889069,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1416.0,
|
|
"completions/max_terminated_length": 1416.0,
|
|
"completions/mean_length": 105.59375,
|
|
"completions/mean_terminated_length": 105.59375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.049107142857142856,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 15.665273666381836,
|
|
"learning_rate": 9.316282404787869e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2746590.0,
|
|
"reward": 0.3062500059604645,
|
|
"reward_std": 0.26257947087287903,
|
|
"rewards/format_reward/mean": 0.5625,
|
|
"rewards/format_reward/std": 0.18898223340511322,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1835.0,
|
|
"completions/mean_length": 111.84375,
|
|
"completions/mean_terminated_length": 81.11111450195312,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05133928571428571,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 17.90364646911621,
|
|
"learning_rate": 9.230476262104676e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2852316.0,
|
|
"reward": 0.26093751192092896,
|
|
"reward_std": 0.11917313188314438,
|
|
"rewards/format_reward/mean": 0.578125,
|
|
"rewards/format_reward/std": 0.2221602201461792,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 721.0,
|
|
"completions/max_terminated_length": 721.0,
|
|
"completions/mean_length": 59.4375,
|
|
"completions/mean_terminated_length": 59.4375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05357142857142857,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 23.127439498901367,
|
|
"learning_rate": 9.1400441557401e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2948880.0,
|
|
"reward": 0.4039062261581421,
|
|
"reward_std": 0.3720070719718933,
|
|
"rewards/format_reward/mean": 0.6015625,
|
|
"rewards/format_reward/std": 0.28423789143562317,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 473.0,
|
|
"completions/max_terminated_length": 473.0,
|
|
"completions/mean_length": 38.484375,
|
|
"completions/mean_terminated_length": 38.484375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05580357142857143,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 23.131183624267578,
|
|
"learning_rate": 9.045084971874737e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3042983.0,
|
|
"reward": 0.49609375,
|
|
"reward_std": 0.39612793922424316,
|
|
"rewards/format_reward/mean": 0.5859375,
|
|
"rewards/format_reward/std": 0.244862899184227,
|
|
"rewards/mcq_exact_match_reward/mean": 0.4375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 396.0,
|
|
"completions/max_terminated_length": 396.0,
|
|
"completions/mean_length": 14.578125,
|
|
"completions/mean_terminated_length": 14.578125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05803571428571429,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 21.188722610473633,
|
|
"learning_rate": 8.945702546981968e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3153444.0,
|
|
"reward": 0.26874998211860657,
|
|
"reward_std": 0.29173365235328674,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.08908708393573761,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 8.234375,
|
|
"completions/mean_terminated_length": 8.234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.060267857142857144,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 31.691007614135742,
|
|
"learning_rate": 8.842005554284295e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3245059.0,
|
|
"reward": 0.3148437738418579,
|
|
"reward_std": 0.4270421266555786,
|
|
"rewards/format_reward/mean": 0.6484375,
|
|
"rewards/format_reward/std": 0.2302463799715042,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 910.0,
|
|
"completions/max_terminated_length": 910.0,
|
|
"completions/mean_length": 23.265625,
|
|
"completions/mean_terminated_length": 23.265625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0625,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 24.359390258789062,
|
|
"learning_rate": 8.734107384920769e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3361140.0,
|
|
"reward": 0.40156251192092896,
|
|
"reward_std": 0.28653684258461,
|
|
"rewards/format_reward/mean": 0.578125,
|
|
"rewards/format_reward/std": 0.18298126757144928,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 237.0,
|
|
"completions/max_terminated_length": 237.0,
|
|
"completions/mean_length": 9.609375,
|
|
"completions/mean_terminated_length": 9.609375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06473214285714286,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 17.99944305419922,
|
|
"learning_rate": 8.622126023955445e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3490331.0,
|
|
"reward": 0.20546874403953552,
|
|
"reward_std": 0.17656923830509186,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.15625,
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 6.625,
|
|
"completions/mean_terminated_length": 6.625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06696428571428571,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 20.938568115234375,
|
|
"learning_rate": 8.506183921362442e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3605843.0,
|
|
"reward": 0.2874999940395355,
|
|
"reward_std": 0.11888101696968079,
|
|
"rewards/format_reward/mean": 0.53125,
|
|
"rewards/format_reward/std": 0.12198751419782639,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 33.0,
|
|
"completions/max_terminated_length": 33.0,
|
|
"completions/mean_length": 6.421875,
|
|
"completions/mean_terminated_length": 6.421875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06919642857142858,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 20.33233070373535,
|
|
"learning_rate": 8.386407858128706e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3710350.0,
|
|
"reward": 0.44062498211860657,
|
|
"reward_std": 0.210945725440979,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 6.109375,
|
|
"completions/mean_terminated_length": 6.109375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.07142857142857142,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 26.781448364257812,
|
|
"learning_rate": 8.262928807620843e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3807701.0,
|
|
"reward": 0.2984374761581421,
|
|
"reward_std": 0.16964475810527802,
|
|
"rewards/format_reward/mean": 0.484375,
|
|
"rewards/format_reward/std": 0.08768405020236969,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 6.84375,
|
|
"completions/mean_terminated_length": 6.84375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.07366071428571429,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 22.09980583190918,
|
|
"learning_rate": 8.135881792367685e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3908627.0,
|
|
"reward": 0.39921873807907104,
|
|
"reward_std": 0.1689612716436386,
|
|
"rewards/format_reward/mean": 0.5546875,
|
|
"rewards/format_reward/std": 0.180765300989151,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 568.0,
|
|
"completions/max_terminated_length": 568.0,
|
|
"completions/mean_length": 19.828125,
|
|
"completions/mean_terminated_length": 19.828125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.07589285714285714,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 24.214155197143555,
|
|
"learning_rate": 8.005405736415125e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3997160.0,
|
|
"reward": 0.3937499523162842,
|
|
"reward_std": 0.3061639666557312,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 476.0,
|
|
"completions/max_terminated_length": 476.0,
|
|
"completions/mean_length": 16.640625,
|
|
"completions/mean_terminated_length": 16.640625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.078125,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 19.65945053100586,
|
|
"learning_rate": 7.871643313414718e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4102257.0,
|
|
"reward": 0.33124998211860657,
|
|
"reward_std": 0.3424546718597412,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 158.0,
|
|
"completions/max_terminated_length": 158.0,
|
|
"completions/mean_length": 9.828125,
|
|
"completions/mean_terminated_length": 9.828125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08035714285714286,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 16.9875545501709,
|
|
"learning_rate": 7.734740790612136e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4218214.0,
|
|
"reward": 0.40625,
|
|
"reward_std": 0.22461532056331635,
|
|
"rewards/format_reward/mean": 0.625,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 148.0,
|
|
"completions/max_terminated_length": 148.0,
|
|
"completions/mean_length": 16.703125,
|
|
"completions/mean_terminated_length": 16.703125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08258928571428571,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 15.114908218383789,
|
|
"learning_rate": 7.594847868906076e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4304195.0,
|
|
"reward": 0.5093749761581421,
|
|
"reward_std": 0.1893727034330368,
|
|
"rewards/format_reward/mean": 0.5625,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 6.8125,
|
|
"completions/mean_terminated_length": 6.8125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08482142857142858,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 16.950824737548828,
|
|
"learning_rate": 7.452117519152541e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4430631.0,
|
|
"reward": 0.5249999761581421,
|
|
"reward_std": 0.29143062233924866,
|
|
"rewards/format_reward/mean": 0.5625,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 221.0,
|
|
"completions/max_terminated_length": 221.0,
|
|
"completions/mean_length": 10.125,
|
|
"completions/mean_terminated_length": 10.125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08705357142857142,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 25.844154357910156,
|
|
"learning_rate": 7.306705814893439e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4522799.0,
|
|
"reward": 0.4312499761581421,
|
|
"reward_std": 0.2177756428718567,
|
|
"rewards/format_reward/mean": 0.5625,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 60.0,
|
|
"completions/max_terminated_length": 60.0,
|
|
"completions/mean_length": 8.203125,
|
|
"completions/mean_terminated_length": 8.203125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08928571428571429,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 31.361331939697266,
|
|
"learning_rate": 7.158771761692464e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4608428.0,
|
|
"reward": 0.6703124642372131,
|
|
"reward_std": 0.049638569355010986,
|
|
"rewards/format_reward/mean": 0.609375,
|
|
"rewards/format_reward/std": 0.2083333432674408,
|
|
"rewards/mcq_exact_match_reward/mean": 0.609375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 8.90625,
|
|
"completions/mean_terminated_length": 8.90625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09151785714285714,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 17.690139770507812,
|
|
"learning_rate": 7.008477123264847e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4691318.0,
|
|
"reward": 0.5726562738418579,
|
|
"reward_std": 0.2708982527256012,
|
|
"rewards/format_reward/mean": 0.7265625,
|
|
"rewards/format_reward/std": 0.250866562128067,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 12.0,
|
|
"completions/max_terminated_length": 12.0,
|
|
"completions/mean_length": 6.09375,
|
|
"completions/mean_terminated_length": 6.09375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09375,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 18.50206756591797,
|
|
"learning_rate": 6.855986244591103e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4789988.0,
|
|
"reward": 0.4257812201976776,
|
|
"reward_std": 0.2243541181087494,
|
|
"rewards/format_reward/mean": 0.5078125,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 359.0,
|
|
"completions/max_terminated_length": 359.0,
|
|
"completions/mean_length": 16.03125,
|
|
"completions/mean_terminated_length": 16.03125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09598214285714286,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 54.797157287597656,
|
|
"learning_rate": 6.701465872208216e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4877334.0,
|
|
"reward": 0.8117187023162842,
|
|
"reward_std": 0.18093490600585938,
|
|
"rewards/format_reward/mean": 0.6171875,
|
|
"rewards/format_reward/std": 0.21347814798355103,
|
|
"rewards/mcq_exact_match_reward/mean": 0.75,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 7.515625,
|
|
"completions/mean_terminated_length": 7.515625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09821428571428571,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 12.843533515930176,
|
|
"learning_rate": 6.545084971874736e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4983247.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.16675157845020294,
|
|
"rewards/format_reward/mean": 0.625,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 8.375,
|
|
"completions/mean_terminated_length": 8.375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.10044642857142858,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 21.998193740844727,
|
|
"learning_rate": 6.387014543809223e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5059511.0,
|
|
"reward": 0.49140626192092896,
|
|
"reward_std": 0.20160752534866333,
|
|
"rewards/format_reward/mean": 0.6953125,
|
|
"rewards/format_reward/std": 0.24587368965148926,
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 7.546875,
|
|
"completions/mean_terminated_length": 7.546875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.10267857142857142,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 28.897733688354492,
|
|
"learning_rate": 6.227427435703995e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5150106.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.28247907757759094,
|
|
"rewards/format_reward/mean": 0.625,
|
|
"rewards/format_reward/std": 0.2182178944349289,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 13.0,
|
|
"completions/max_terminated_length": 13.0,
|
|
"completions/mean_length": 6.109375,
|
|
"completions/mean_terminated_length": 6.109375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.10491071428571429,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 23.489076614379883,
|
|
"learning_rate": 6.066498153718734e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5243569.0,
|
|
"reward": 0.5351561903953552,
|
|
"reward_std": 0.18003448843955994,
|
|
"rewards/format_reward/mean": 0.5078125,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.484375,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 16.0,
|
|
"completions/max_terminated_length": 16.0,
|
|
"completions/mean_length": 9.90625,
|
|
"completions/mean_terminated_length": 9.90625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.10714285714285714,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 7.710181713104248,
|
|
"learning_rate": 5.90440267166055e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5309691.0,
|
|
"reward": 0.17500001192092896,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/format_reward/mean": 0.8125,
|
|
"rewards/format_reward/std": 0.24397502839565277,
|
|
"rewards/mcq_exact_match_reward/mean": 0.09375,
|
|
"rewards/mcq_exact_match_reward/std": 0.29378482699394226,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 43.0,
|
|
"completions/max_terminated_length": 43.0,
|
|
"completions/mean_length": 8.8125,
|
|
"completions/mean_terminated_length": 8.8125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.109375,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 21.57369613647461,
|
|
"learning_rate": 5.741318238559209e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5411943.0,
|
|
"reward": 0.5812499523162842,
|
|
"reward_std": 0.2802783250808716,
|
|
"rewards/format_reward/mean": 0.65625,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.515625,
|
|
"rewards/mcq_exact_match_reward/std": 0.5037065148353577,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 572.0,
|
|
"completions/max_terminated_length": 572.0,
|
|
"completions/mean_length": 20.78125,
|
|
"completions/mean_terminated_length": 20.78125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.11160714285714286,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 31.959693908691406,
|
|
"learning_rate": 5.577423184847931e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5512089.0,
|
|
"reward": 0.40937498211860657,
|
|
"reward_std": 0.20373183488845825,
|
|
"rewards/format_reward/mean": 0.65625,
|
|
"rewards/format_reward/std": 0.233588308095932,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 50
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 100,
|
|
"num_input_tokens_seen": 5512089,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|