Files
goldengoose-divsweep_goose_…/checkpoint-50/trainer_state.json
ModelHub XC 0e4cd1c528 初始化项目,由ModelHub XC社区提供模型
Model: cjiao/goldengoose-divsweep_goose_n512_indorc_tau1.00-7grp
Source: Original Platform
2026-06-27 20:10:04 +08:00

1385 lines
49 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11160714285714286,
"eval_steps": 500,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1386.0,
"completions/mean_length": 515.03125,
"completions/mean_terminated_length": 490.6984558105469,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.002232142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.372617721557617,
"learning_rate": 0.0,
"loss": 0.0,
"num_tokens": 116874.0,
"reward": 0.296875,
"reward_std": 0.36100417375564575,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.24397502839565277,
"rewards/mcq_exact_match_reward/mean": 0.265625,
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1737.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 464.234375,
"completions/mean_terminated_length": 464.234375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.004464285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.42634105682373,
"learning_rate": 2e-07,
"loss": -0.0,
"num_tokens": 230473.0,
"reward": 0.33281248807907104,
"reward_std": 0.35629093647003174,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.28824523091316223,
"rewards/mcq_exact_match_reward/mean": 0.296875,
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1479.0,
"completions/max_terminated_length": 1479.0,
"completions/mean_length": 375.984375,
"completions/mean_terminated_length": 375.984375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.006696428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.815962791442871,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 346168.0,
"reward": 0.21718749403953552,
"reward_std": 0.3329676389694214,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.2630521357059479,
"rewards/mcq_exact_match_reward/mean": 0.1875,
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1428.0,
"completions/mean_length": 490.515625,
"completions/mean_terminated_length": 386.683349609375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.008928571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.687461853027344,
"learning_rate": 6e-07,
"loss": 0.0,
"num_tokens": 486377.0,
"reward": 0.2593749761581421,
"reward_std": 0.3835652470588684,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.25,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1506.0,
"completions/mean_length": 606.28125,
"completions/mean_terminated_length": 535.3770141601562,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.011160714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.5729923248291,
"learning_rate": 8e-07,
"loss": 0.0,
"num_tokens": 620971.0,
"reward": 0.23984374105930328,
"reward_std": 0.3459582030773163,
"rewards/format_reward/mean": 0.3671875,
"rewards/format_reward/std": 0.23974503576755524,
"rewards/mcq_exact_match_reward/mean": 0.203125,
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1333.0,
"completions/mean_length": 531.765625,
"completions/mean_terminated_length": 482.8548278808594,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.013392857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.315047264099121,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 739660.0,
"reward": 0.28203123807907104,
"reward_std": 0.23655115067958832,
"rewards/format_reward/mean": 0.3203125,
"rewards/format_reward/std": 0.27265870571136475,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1883.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 534.65625,
"completions/mean_terminated_length": 534.65625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.015625,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.712610244750977,
"learning_rate": 9.99726628670463e-07,
"loss": -0.0,
"num_tokens": 868742.0,
"reward": 0.3554687201976776,
"reward_std": 0.20643417537212372,
"rewards/format_reward/mean": 0.4296875,
"rewards/format_reward/std": 0.23345555365085602,
"rewards/mcq_exact_match_reward/mean": 0.3125,
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 639.4375,
"completions/mean_terminated_length": 570.1638793945312,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.017857142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.94524097442627,
"learning_rate": 9.989068136093872e-07,
"loss": 0.0,
"num_tokens": 1012874.0,
"reward": 0.20781248807907104,
"reward_std": 0.30935126543045044,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.2741328477859497,
"rewards/mcq_exact_match_reward/mean": 0.171875,
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1730.0,
"completions/max_terminated_length": 1730.0,
"completions/mean_length": 345.8125,
"completions/mean_terminated_length": 345.8125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.020089285714285716,
"frac_reward_zero_std": 0.25,
"grad_norm": 10.861742973327637,
"learning_rate": 9.975414512725056e-07,
"loss": -0.0,
"num_tokens": 1141086.0,
"reward": 0.2593749761581421,
"reward_std": 0.2899813652038574,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.21593283116817474,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1430.0,
"completions/max_terminated_length": 1430.0,
"completions/mean_length": 466.390625,
"completions/mean_terminated_length": 466.390625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.022321428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.960936546325684,
"learning_rate": 9.956320346634875e-07,
"loss": 0.0,
"num_tokens": 1260815.0,
"reward": 0.27421873807907104,
"reward_std": 0.3824688494205475,
"rewards/format_reward/mean": 0.3984375,
"rewards/format_reward/std": 0.28423789143562317,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1601.0,
"completions/mean_length": 428.53125,
"completions/mean_terminated_length": 376.2903137207031,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.024553571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 18.18084716796875,
"learning_rate": 9.931806517013612e-07,
"loss": 0.0,
"num_tokens": 1379769.0,
"reward": 0.23281249403953552,
"reward_std": 0.2880294919013977,
"rewards/format_reward/mean": 0.453125,
"rewards/format_reward/std": 0.2630521357059479,
"rewards/mcq_exact_match_reward/mean": 0.1875,
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 680.0,
"completions/max_terminated_length": 680.0,
"completions/mean_length": 135.90625,
"completions/mean_terminated_length": 135.90625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.026785714285714284,
"frac_reward_zero_std": 0.375,
"grad_norm": 17.66448402404785,
"learning_rate": 9.901899829374047e-07,
"loss": -0.0,
"num_tokens": 1470971.0,
"reward": 0.4703124761581421,
"reward_std": 0.28840553760528564,
"rewards/format_reward/mean": 0.484375,
"rewards/format_reward/std": 0.17747680842876434,
"rewards/mcq_exact_match_reward/mean": 0.421875,
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1568.0,
"completions/max_terminated_length": 1568.0,
"completions/mean_length": 217.578125,
"completions/mean_terminated_length": 217.578125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.029017857142857144,
"frac_reward_zero_std": 0.625,
"grad_norm": 10.263340950012207,
"learning_rate": 9.866632986240029e-07,
"loss": 0.0,
"num_tokens": 1578096.0,
"reward": 0.2523437440395355,
"reward_std": 0.11321917921304703,
"rewards/format_reward/mean": 0.4921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.203125,
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 842.0,
"completions/max_terminated_length": 842.0,
"completions/mean_length": 99.265625,
"completions/mean_terminated_length": 99.265625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03125,
"frac_reward_zero_std": 0.125,
"grad_norm": 18.594722747802734,
"learning_rate": 9.826044551386742e-07,
"loss": 0.0,
"num_tokens": 1670537.0,
"reward": 0.42812496423721313,
"reward_std": 0.3584539592266083,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.17536810040473938,
"rewards/mcq_exact_match_reward/mean": 0.375,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 533.0,
"completions/max_terminated_length": 533.0,
"completions/mean_length": 27.75,
"completions/mean_terminated_length": 27.75,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.033482142857142856,
"frac_reward_zero_std": 0.75,
"grad_norm": 10.696029663085938,
"learning_rate": 9.780178907671788e-07,
"loss": -0.0,
"num_tokens": 1772369.0,
"reward": 0.5031249523162842,
"reward_std": 0.11100947111845016,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03571428571428571,
"frac_reward_zero_std": 0.75,
"grad_norm": 16.516544342041016,
"learning_rate": 9.729086208503173e-07,
"loss": -0.0,
"num_tokens": 1875425.0,
"reward": 0.40937498211860657,
"reward_std": 0.12255740165710449,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.03794642857142857,
"frac_reward_zero_std": 0.375,
"grad_norm": 21.267547607421875,
"learning_rate": 9.672822322997304e-07,
"loss": 0.0,
"num_tokens": 1988737.0,
"reward": 0.28437498211860657,
"reward_std": 0.26196980476379395,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.04017857142857143,
"frac_reward_zero_std": 0.625,
"grad_norm": 20.922260284423828,
"learning_rate": 9.611448774886923e-07,
"loss": -0.0,
"num_tokens": 2100009.0,
"reward": 0.7687499523162842,
"reward_std": 0.1872510462999344,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.71875,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 59.0,
"completions/max_terminated_length": 59.0,
"completions/mean_length": 6.828125,
"completions/mean_terminated_length": 6.828125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.04241071428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 22.29154396057129,
"learning_rate": 9.545032675245813e-07,
"loss": -0.0,
"num_tokens": 2228990.0,
"reward": 0.39374998211860657,
"reward_std": 0.23356686532497406,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 40.0,
"completions/mean_length": 6.53125,
"completions/mean_terminated_length": 6.53125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.044642857142857144,
"frac_reward_zero_std": 0.5,
"grad_norm": 17.501502990722656,
"learning_rate": 9.473646649103817e-07,
"loss": 0.0,
"num_tokens": 2338600.0,
"reward": 0.4093749523162842,
"reward_std": 0.19044625759124756,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 7.3125,
"completions/mean_terminated_length": 7.3125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.046875,
"frac_reward_zero_std": 0.75,
"grad_norm": 14.592903137207031,
"learning_rate": 9.397368756032444e-07,
"loss": 0.0,
"num_tokens": 2423444.0,
"reward": 0.19999998807907104,
"reward_std": 0.06365012377500534,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.1666666716337204,
"rewards/mcq_exact_match_reward/mean": 0.15625,
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.049107142857142856,
"frac_reward_zero_std": 0.625,
"grad_norm": 15.976240158081055,
"learning_rate": 9.316282404787869e-07,
"loss": -0.0,
"num_tokens": 2520124.0,
"reward": 0.31562498211860657,
"reward_std": 0.17782476544380188,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.265625,
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 540.0,
"completions/max_terminated_length": 540.0,
"completions/mean_length": 17.328125,
"completions/mean_terminated_length": 17.328125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05133928571428571,
"frac_reward_zero_std": 0.625,
"grad_norm": 14.27862548828125,
"learning_rate": 9.230476262104676e-07,
"loss": -0.0,
"num_tokens": 2650241.0,
"reward": 0.5812499523162842,
"reward_std": 0.15520364046096802,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.53125,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 6.078125,
"completions/mean_terminated_length": 6.078125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05357142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 15.152713775634766,
"learning_rate": 9.1400441557401e-07,
"loss": -0.0,
"num_tokens": 2770894.0,
"reward": 0.19062498211860657,
"reward_std": 0.2198972851037979,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.140625,
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 211.0,
"completions/max_terminated_length": 211.0,
"completions/mean_length": 9.203125,
"completions/mean_terminated_length": 9.203125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05580357142857143,
"frac_reward_zero_std": 0.625,
"grad_norm": 29.53569984436035,
"learning_rate": 9.045084971874737e-07,
"loss": -0.0,
"num_tokens": 2841411.0,
"reward": 0.4406249523162842,
"reward_std": 0.17782476544380188,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.390625,
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.05803571428571429,
"frac_reward_zero_std": 0.625,
"grad_norm": 23.521568298339844,
"learning_rate": 8.945702546981968e-07,
"loss": -0.0,
"num_tokens": 2926323.0,
"reward": 0.5031249523162842,
"reward_std": 0.15992167592048645,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.060267857142857144,
"frac_reward_zero_std": 0.625,
"grad_norm": 17.729854583740234,
"learning_rate": 8.842005554284295e-07,
"loss": -0.0,
"num_tokens": 3047355.0,
"reward": 0.4093749523162842,
"reward_std": 0.15992167592048645,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0625,
"frac_reward_zero_std": 0.75,
"grad_norm": 10.808194160461426,
"learning_rate": 8.734107384920769e-07,
"loss": 0.0,
"num_tokens": 3145515.0,
"reward": 0.29999998211860657,
"reward_std": 0.0883883386850357,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.25,
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.06473214285714286,
"frac_reward_zero_std": 0.625,
"grad_norm": 18.28078269958496,
"learning_rate": 8.622126023955445e-07,
"loss": -0.0,
"num_tokens": 3238355.0,
"reward": 0.28437498211860657,
"reward_std": 0.16887322068214417,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.234375,
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 650.0,
"completions/max_terminated_length": 650.0,
"completions/mean_length": 16.0625,
"completions/mean_terminated_length": 16.0625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.06696428571428571,
"frac_reward_zero_std": 0.75,
"grad_norm": 12.452601432800293,
"learning_rate": 8.506183921362442e-07,
"loss": -0.0,
"num_tokens": 3350191.0,
"reward": 0.3781249523162842,
"reward_std": 0.10205792635679245,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.328125,
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.06919642857142858,
"frac_reward_zero_std": 0.5,
"grad_norm": 32.80612564086914,
"learning_rate": 8.386407858128706e-07,
"loss": -0.0,
"num_tokens": 3443559.0,
"reward": 0.33124998211860657,
"reward_std": 0.21306739747524261,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.28125,
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 183.0,
"completions/max_terminated_length": 183.0,
"completions/mean_length": 10.84375,
"completions/mean_terminated_length": 10.84375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.07142857142857142,
"frac_reward_zero_std": 0.5,
"grad_norm": 21.804208755493164,
"learning_rate": 8.262928807620843e-07,
"loss": 0.0,
"num_tokens": 3553701.0,
"reward": 0.40937498211860657,
"reward_std": 0.2109457403421402,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.07366071428571429,
"frac_reward_zero_std": 0.625,
"grad_norm": 15.285557746887207,
"learning_rate": 8.135881792367685e-07,
"loss": 0.0,
"num_tokens": 3658485.0,
"reward": 0.40937498211860657,
"reward_std": 0.1530819833278656,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.07589285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 19.146930694580078,
"learning_rate": 8.005405736415125e-07,
"loss": -0.0,
"num_tokens": 3753581.0,
"reward": 0.5031249523162842,
"reward_std": 0.11100947111845016,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.078125,
"frac_reward_zero_std": 0.875,
"grad_norm": 12.522613525390625,
"learning_rate": 7.871643313414718e-07,
"loss": -0.0,
"num_tokens": 3851317.0,
"reward": 0.4562499523162842,
"reward_std": 0.0578637532889843,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.40625,
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 29.0,
"completions/max_terminated_length": 29.0,
"completions/mean_length": 6.578125,
"completions/mean_terminated_length": 6.578125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08035714285714286,
"frac_reward_zero_std": 0.375,
"grad_norm": 38.662662506103516,
"learning_rate": 7.734740790612136e-07,
"loss": 0.0,
"num_tokens": 3935226.0,
"reward": 0.34609371423721313,
"reward_std": 0.20160752534866333,
"rewards/format_reward/mean": 0.4921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.296875,
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08258928571428571,
"frac_reward_zero_std": 0.75,
"grad_norm": 32.49333572387695,
"learning_rate": 7.594847868906076e-07,
"loss": -0.0,
"num_tokens": 4054722.0,
"reward": 0.5968749523162842,
"reward_std": 0.11100947856903076,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.546875,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08482142857142858,
"frac_reward_zero_std": 0.625,
"grad_norm": 38.19132614135742,
"learning_rate": 7.452117519152541e-07,
"loss": 0.0,
"num_tokens": 4151506.0,
"reward": 0.7164062261581421,
"reward_std": 0.15529169142246246,
"rewards/format_reward/mean": 0.4453125,
"rewards/format_reward/std": 0.15728822350502014,
"rewards/mcq_exact_match_reward/mean": 0.671875,
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08705357142857142,
"frac_reward_zero_std": 0.5,
"grad_norm": 30.554147720336914,
"learning_rate": 7.306705814893439e-07,
"loss": 0.0,
"num_tokens": 4230210.0,
"reward": 0.3937499523162842,
"reward_std": 0.19727614521980286,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.34375,
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08928571428571429,
"frac_reward_zero_std": 0.625,
"grad_norm": 23.661088943481445,
"learning_rate": 7.158771761692464e-07,
"loss": -0.0,
"num_tokens": 4335138.0,
"reward": 0.34609371423721313,
"reward_std": 0.15529169142246246,
"rewards/format_reward/mean": 0.4921875,
"rewards/format_reward/std": 0.0625,
"rewards/mcq_exact_match_reward/mean": 0.296875,
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.09151785714285714,
"frac_reward_zero_std": 0.625,
"grad_norm": 32.43943786621094,
"learning_rate": 7.008477123264847e-07,
"loss": 0.0,
"num_tokens": 4439562.0,
"reward": 0.23749998211860657,
"reward_std": 0.16675156354904175,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.1875,
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.09375,
"frac_reward_zero_std": 0.625,
"grad_norm": 13.679961204528809,
"learning_rate": 6.855986244591103e-07,
"loss": -0.0,
"num_tokens": 4555042.0,
"reward": 0.5031249523162842,
"reward_std": 0.13258251547813416,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.453125,
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.09598214285714286,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.701465872208216e-07,
"loss": 0.0,
"num_tokens": 4661210.0,
"reward": 0.4249999523162842,
"reward_std": 0.0,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.375,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 125.0,
"completions/max_terminated_length": 125.0,
"completions/mean_length": 7.859375,
"completions/mean_terminated_length": 7.859375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.09821428571428571,
"frac_reward_zero_std": 0.875,
"grad_norm": 6.7772626876831055,
"learning_rate": 6.545084971874736e-07,
"loss": -0.0,
"num_tokens": 4763577.0,
"reward": 0.4093749523162842,
"reward_std": 0.04419417306780815,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.359375,
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.10044642857142858,
"frac_reward_zero_std": 0.625,
"grad_norm": 29.50361442565918,
"learning_rate": 6.387014543809223e-07,
"loss": 0.0,
"num_tokens": 4850985.0,
"reward": 0.5499999523162842,
"reward_std": 0.16675156354904175,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.5,
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.10267857142857142,
"frac_reward_zero_std": 0.875,
"grad_norm": 14.329254150390625,
"learning_rate": 6.227427435703995e-07,
"loss": -0.0,
"num_tokens": 4946729.0,
"reward": 0.5187499523162842,
"reward_std": 0.0578637532889843,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.46875,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.10491071428571429,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.066498153718734e-07,
"loss": 0.0,
"num_tokens": 5027633.0,
"reward": 0.6749999523162842,
"reward_std": 0.0,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.625,
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.10714285714285714,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 5.90440267166055e-07,
"loss": 0.0,
"num_tokens": 5123361.0,
"reward": 0.5499999523162842,
"reward_std": 0.0,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.5,
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 6.0,
"completions/mean_terminated_length": 6.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.109375,
"frac_reward_zero_std": 0.875,
"grad_norm": 15.217447280883789,
"learning_rate": 5.741318238559209e-07,
"loss": -0.0,
"num_tokens": 5215425.0,
"reward": 0.5187499523162842,
"reward_std": 0.0578637532889843,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.46875,
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 418.0,
"completions/max_terminated_length": 418.0,
"completions/mean_length": 12.4375,
"completions/mean_terminated_length": 12.4375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.11160714285714286,
"frac_reward_zero_std": 0.625,
"grad_norm": 29.34973907470703,
"learning_rate": 5.577423184847931e-07,
"loss": 0.0,
"num_tokens": 5331149.0,
"reward": 0.26874998211860657,
"reward_std": 0.1462520956993103,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.0,
"rewards/mcq_exact_match_reward/mean": 0.21875,
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 5331149,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}