Model: cjiao/goldengoose-divsweep_goose_n512_indorc_tau1.00-7grp Source: Original Platform
1385 lines
49 KiB
JSON
1385 lines
49 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.11160714285714286,
|
|
"eval_steps": 500,
|
|
"global_step": 50,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1386.0,
|
|
"completions/mean_length": 515.03125,
|
|
"completions/mean_terminated_length": 490.6984558105469,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.002232142857142857,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.372617721557617,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0,
|
|
"num_tokens": 116874.0,
|
|
"reward": 0.296875,
|
|
"reward_std": 0.36100417375564575,
|
|
"rewards/format_reward/mean": 0.3125,
|
|
"rewards/format_reward/std": 0.24397502839565277,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1737.0,
|
|
"completions/max_terminated_length": 1737.0,
|
|
"completions/mean_length": 464.234375,
|
|
"completions/mean_terminated_length": 464.234375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.004464285714285714,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 9.42634105682373,
|
|
"learning_rate": 2e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 230473.0,
|
|
"reward": 0.33281248807907104,
|
|
"reward_std": 0.35629093647003174,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.28824523091316223,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1479.0,
|
|
"completions/max_terminated_length": 1479.0,
|
|
"completions/mean_length": 375.984375,
|
|
"completions/mean_terminated_length": 375.984375,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.006696428571428571,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.815962791442871,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 346168.0,
|
|
"reward": 0.21718749403953552,
|
|
"reward_std": 0.3329676389694214,
|
|
"rewards/format_reward/mean": 0.296875,
|
|
"rewards/format_reward/std": 0.2630521357059479,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0625,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1428.0,
|
|
"completions/mean_length": 490.515625,
|
|
"completions/mean_terminated_length": 386.683349609375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.008928571428571428,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 6.687461853027344,
|
|
"learning_rate": 6e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 486377.0,
|
|
"reward": 0.2593749761581421,
|
|
"reward_std": 0.3835652470588684,
|
|
"rewards/format_reward/mean": 0.40625,
|
|
"rewards/format_reward/std": 0.25,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1506.0,
|
|
"completions/mean_length": 606.28125,
|
|
"completions/mean_terminated_length": 535.3770141601562,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.011160714285714286,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 16.5729923248291,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 620971.0,
|
|
"reward": 0.23984374105930328,
|
|
"reward_std": 0.3459582030773163,
|
|
"rewards/format_reward/mean": 0.3671875,
|
|
"rewards/format_reward/std": 0.23974503576755524,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1333.0,
|
|
"completions/mean_length": 531.765625,
|
|
"completions/mean_terminated_length": 482.8548278808594,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.013392857142857142,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 7.315047264099121,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 739660.0,
|
|
"reward": 0.28203123807907104,
|
|
"reward_std": 0.23655115067958832,
|
|
"rewards/format_reward/mean": 0.3203125,
|
|
"rewards/format_reward/std": 0.27265870571136475,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1883.0,
|
|
"completions/max_terminated_length": 1883.0,
|
|
"completions/mean_length": 534.65625,
|
|
"completions/mean_terminated_length": 534.65625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.015625,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.712610244750977,
|
|
"learning_rate": 9.99726628670463e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 868742.0,
|
|
"reward": 0.3554687201976776,
|
|
"reward_std": 0.20643417537212372,
|
|
"rewards/format_reward/mean": 0.4296875,
|
|
"rewards/format_reward/std": 0.23345555365085602,
|
|
"rewards/mcq_exact_match_reward/mean": 0.3125,
|
|
"rewards/mcq_exact_match_reward/std": 0.467176616191864,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1883.0,
|
|
"completions/mean_length": 639.4375,
|
|
"completions/mean_terminated_length": 570.1638793945312,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.017857142857142856,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.94524097442627,
|
|
"learning_rate": 9.989068136093872e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1012874.0,
|
|
"reward": 0.20781248807907104,
|
|
"reward_std": 0.30935126543045044,
|
|
"rewards/format_reward/mean": 0.359375,
|
|
"rewards/format_reward/std": 0.2741328477859497,
|
|
"rewards/mcq_exact_match_reward/mean": 0.171875,
|
|
"rewards/mcq_exact_match_reward/std": 0.38025420904159546,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1730.0,
|
|
"completions/max_terminated_length": 1730.0,
|
|
"completions/mean_length": 345.8125,
|
|
"completions/mean_terminated_length": 345.8125,
|
|
"completions/min_length": 3.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.020089285714285716,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 10.861742973327637,
|
|
"learning_rate": 9.975414512725056e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1141086.0,
|
|
"reward": 0.2593749761581421,
|
|
"reward_std": 0.2899813652038574,
|
|
"rewards/format_reward/mean": 0.40625,
|
|
"rewards/format_reward/std": 0.21593283116817474,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1430.0,
|
|
"completions/max_terminated_length": 1430.0,
|
|
"completions/mean_length": 466.390625,
|
|
"completions/mean_terminated_length": 466.390625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.022321428571428572,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 10.960936546325684,
|
|
"learning_rate": 9.956320346634875e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1260815.0,
|
|
"reward": 0.27421873807907104,
|
|
"reward_std": 0.3824688494205475,
|
|
"rewards/format_reward/mean": 0.3984375,
|
|
"rewards/format_reward/std": 0.28423789143562317,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.03125,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1601.0,
|
|
"completions/mean_length": 428.53125,
|
|
"completions/mean_terminated_length": 376.2903137207031,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.024553571428571428,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 18.18084716796875,
|
|
"learning_rate": 9.931806517013612e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1379769.0,
|
|
"reward": 0.23281249403953552,
|
|
"reward_std": 0.2880294919013977,
|
|
"rewards/format_reward/mean": 0.453125,
|
|
"rewards/format_reward/std": 0.2630521357059479,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 680.0,
|
|
"completions/max_terminated_length": 680.0,
|
|
"completions/mean_length": 135.90625,
|
|
"completions/mean_terminated_length": 135.90625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.026785714285714284,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 17.66448402404785,
|
|
"learning_rate": 9.901899829374047e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1470971.0,
|
|
"reward": 0.4703124761581421,
|
|
"reward_std": 0.28840553760528564,
|
|
"rewards/format_reward/mean": 0.484375,
|
|
"rewards/format_reward/std": 0.17747680842876434,
|
|
"rewards/mcq_exact_match_reward/mean": 0.421875,
|
|
"rewards/mcq_exact_match_reward/std": 0.49776285886764526,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 1568.0,
|
|
"completions/max_terminated_length": 1568.0,
|
|
"completions/mean_length": 217.578125,
|
|
"completions/mean_terminated_length": 217.578125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.029017857142857144,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 10.263340950012207,
|
|
"learning_rate": 9.866632986240029e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1578096.0,
|
|
"reward": 0.2523437440395355,
|
|
"reward_std": 0.11321917921304703,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.203125,
|
|
"rewards/mcq_exact_match_reward/std": 0.40550529956817627,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 842.0,
|
|
"completions/max_terminated_length": 842.0,
|
|
"completions/mean_length": 99.265625,
|
|
"completions/mean_terminated_length": 99.265625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03125,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 18.594722747802734,
|
|
"learning_rate": 9.826044551386742e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1670537.0,
|
|
"reward": 0.42812496423721313,
|
|
"reward_std": 0.3584539592266083,
|
|
"rewards/format_reward/mean": 0.53125,
|
|
"rewards/format_reward/std": 0.17536810040473938,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 533.0,
|
|
"completions/max_terminated_length": 533.0,
|
|
"completions/mean_length": 27.75,
|
|
"completions/mean_terminated_length": 27.75,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.033482142857142856,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 10.696029663085938,
|
|
"learning_rate": 9.780178907671788e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1772369.0,
|
|
"reward": 0.5031249523162842,
|
|
"reward_std": 0.11100947111845016,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03571428571428571,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 16.516544342041016,
|
|
"learning_rate": 9.729086208503173e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 1875425.0,
|
|
"reward": 0.40937498211860657,
|
|
"reward_std": 0.12255740165710449,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.03794642857142857,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 21.267547607421875,
|
|
"learning_rate": 9.672822322997304e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1988737.0,
|
|
"reward": 0.28437498211860657,
|
|
"reward_std": 0.26196980476379395,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04017857142857143,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 20.922260284423828,
|
|
"learning_rate": 9.611448774886923e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2100009.0,
|
|
"reward": 0.7687499523162842,
|
|
"reward_std": 0.1872510462999344,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.71875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 59.0,
|
|
"completions/max_terminated_length": 59.0,
|
|
"completions/mean_length": 6.828125,
|
|
"completions/mean_terminated_length": 6.828125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.04241071428571429,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 22.29154396057129,
|
|
"learning_rate": 9.545032675245813e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2228990.0,
|
|
"reward": 0.39374998211860657,
|
|
"reward_std": 0.23356686532497406,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 40.0,
|
|
"completions/mean_length": 6.53125,
|
|
"completions/mean_terminated_length": 6.53125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.044642857142857144,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 17.501502990722656,
|
|
"learning_rate": 9.473646649103817e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2338600.0,
|
|
"reward": 0.4093749523162842,
|
|
"reward_std": 0.19044625759124756,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 7.3125,
|
|
"completions/mean_terminated_length": 7.3125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.046875,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 14.592903137207031,
|
|
"learning_rate": 9.397368756032444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 2423444.0,
|
|
"reward": 0.19999998807907104,
|
|
"reward_std": 0.06365012377500534,
|
|
"rewards/format_reward/mean": 0.4375,
|
|
"rewards/format_reward/std": 0.1666666716337204,
|
|
"rewards/mcq_exact_match_reward/mean": 0.15625,
|
|
"rewards/mcq_exact_match_reward/std": 0.36596253514289856,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.049107142857142856,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 15.976240158081055,
|
|
"learning_rate": 9.316282404787869e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2520124.0,
|
|
"reward": 0.31562498211860657,
|
|
"reward_std": 0.17782476544380188,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.265625,
|
|
"rewards/mcq_exact_match_reward/std": 0.44515693187713623,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 540.0,
|
|
"completions/max_terminated_length": 540.0,
|
|
"completions/mean_length": 17.328125,
|
|
"completions/mean_terminated_length": 17.328125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05133928571428571,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 14.27862548828125,
|
|
"learning_rate": 9.230476262104676e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2650241.0,
|
|
"reward": 0.5812499523162842,
|
|
"reward_std": 0.15520364046096802,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.53125,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 11.0,
|
|
"completions/max_terminated_length": 11.0,
|
|
"completions/mean_length": 6.078125,
|
|
"completions/mean_terminated_length": 6.078125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05357142857142857,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 15.152713775634766,
|
|
"learning_rate": 9.1400441557401e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2770894.0,
|
|
"reward": 0.19062498211860657,
|
|
"reward_std": 0.2198972851037979,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.140625,
|
|
"rewards/mcq_exact_match_reward/std": 0.3503824472427368,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 211.0,
|
|
"completions/max_terminated_length": 211.0,
|
|
"completions/mean_length": 9.203125,
|
|
"completions/mean_terminated_length": 9.203125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05580357142857143,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 29.53569984436035,
|
|
"learning_rate": 9.045084971874737e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2841411.0,
|
|
"reward": 0.4406249523162842,
|
|
"reward_std": 0.17782476544380188,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.390625,
|
|
"rewards/mcq_exact_match_reward/std": 0.4917473793029785,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.05803571428571429,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 23.521568298339844,
|
|
"learning_rate": 8.945702546981968e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 2926323.0,
|
|
"reward": 0.5031249523162842,
|
|
"reward_std": 0.15992167592048645,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.060267857142857144,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 17.729854583740234,
|
|
"learning_rate": 8.842005554284295e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3047355.0,
|
|
"reward": 0.4093749523162842,
|
|
"reward_std": 0.15992167592048645,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.0625,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 10.808194160461426,
|
|
"learning_rate": 8.734107384920769e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3145515.0,
|
|
"reward": 0.29999998211860657,
|
|
"reward_std": 0.0883883386850357,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.25,
|
|
"rewards/mcq_exact_match_reward/std": 0.4364357888698578,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06473214285714286,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 18.28078269958496,
|
|
"learning_rate": 8.622126023955445e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3238355.0,
|
|
"reward": 0.28437498211860657,
|
|
"reward_std": 0.16887322068214417,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.234375,
|
|
"rewards/mcq_exact_match_reward/std": 0.42695629596710205,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 650.0,
|
|
"completions/max_terminated_length": 650.0,
|
|
"completions/mean_length": 16.0625,
|
|
"completions/mean_terminated_length": 16.0625,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06696428571428571,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 12.452601432800293,
|
|
"learning_rate": 8.506183921362442e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3350191.0,
|
|
"reward": 0.3781249523162842,
|
|
"reward_std": 0.10205792635679245,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.328125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.06919642857142858,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 32.80612564086914,
|
|
"learning_rate": 8.386407858128706e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3443559.0,
|
|
"reward": 0.33124998211860657,
|
|
"reward_std": 0.21306739747524261,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.28125,
|
|
"rewards/mcq_exact_match_reward/std": 0.4531635046005249,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 183.0,
|
|
"completions/max_terminated_length": 183.0,
|
|
"completions/mean_length": 10.84375,
|
|
"completions/mean_terminated_length": 10.84375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.07142857142857142,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 21.804208755493164,
|
|
"learning_rate": 8.262928807620843e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3553701.0,
|
|
"reward": 0.40937498211860657,
|
|
"reward_std": 0.2109457403421402,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.07366071428571429,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 15.285557746887207,
|
|
"learning_rate": 8.135881792367685e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3658485.0,
|
|
"reward": 0.40937498211860657,
|
|
"reward_std": 0.1530819833278656,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.07589285714285714,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 19.146930694580078,
|
|
"learning_rate": 8.005405736415125e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3753581.0,
|
|
"reward": 0.5031249523162842,
|
|
"reward_std": 0.11100947111845016,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.078125,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 12.522613525390625,
|
|
"learning_rate": 7.871643313414718e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 3851317.0,
|
|
"reward": 0.4562499523162842,
|
|
"reward_std": 0.0578637532889843,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.40625,
|
|
"rewards/mcq_exact_match_reward/std": 0.49501484632492065,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 29.0,
|
|
"completions/max_terminated_length": 29.0,
|
|
"completions/mean_length": 6.578125,
|
|
"completions/mean_terminated_length": 6.578125,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08035714285714286,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 38.662662506103516,
|
|
"learning_rate": 7.734740790612136e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3935226.0,
|
|
"reward": 0.34609371423721313,
|
|
"reward_std": 0.20160752534866333,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08258928571428571,
|
|
"frac_reward_zero_std": 0.75,
|
|
"grad_norm": 32.49333572387695,
|
|
"learning_rate": 7.594847868906076e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4054722.0,
|
|
"reward": 0.5968749523162842,
|
|
"reward_std": 0.11100947856903076,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.546875,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08482142857142858,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 38.19132614135742,
|
|
"learning_rate": 7.452117519152541e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4151506.0,
|
|
"reward": 0.7164062261581421,
|
|
"reward_std": 0.15529169142246246,
|
|
"rewards/format_reward/mean": 0.4453125,
|
|
"rewards/format_reward/std": 0.15728822350502014,
|
|
"rewards/mcq_exact_match_reward/mean": 0.671875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4732423722743988,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08705357142857142,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 30.554147720336914,
|
|
"learning_rate": 7.306705814893439e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4230210.0,
|
|
"reward": 0.3937499523162842,
|
|
"reward_std": 0.19727614521980286,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.34375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4787135720252991,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.08928571428571429,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 23.661088943481445,
|
|
"learning_rate": 7.158771761692464e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4335138.0,
|
|
"reward": 0.34609371423721313,
|
|
"reward_std": 0.15529169142246246,
|
|
"rewards/format_reward/mean": 0.4921875,
|
|
"rewards/format_reward/std": 0.0625,
|
|
"rewards/mcq_exact_match_reward/mean": 0.296875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4604927599430084,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09151785714285714,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 32.43943786621094,
|
|
"learning_rate": 7.008477123264847e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4439562.0,
|
|
"reward": 0.23749998211860657,
|
|
"reward_std": 0.16675156354904175,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.1875,
|
|
"rewards/mcq_exact_match_reward/std": 0.39339789748191833,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09375,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 13.679961204528809,
|
|
"learning_rate": 6.855986244591103e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4555042.0,
|
|
"reward": 0.5031249523162842,
|
|
"reward_std": 0.13258251547813416,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.453125,
|
|
"rewards/mcq_exact_match_reward/std": 0.501733124256134,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09598214285714286,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.701465872208216e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4661210.0,
|
|
"reward": 0.4249999523162842,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.375,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 125.0,
|
|
"completions/max_terminated_length": 125.0,
|
|
"completions/mean_length": 7.859375,
|
|
"completions/mean_terminated_length": 7.859375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.09821428571428571,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 6.7772626876831055,
|
|
"learning_rate": 6.545084971874736e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4763577.0,
|
|
"reward": 0.4093749523162842,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.359375,
|
|
"rewards/mcq_exact_match_reward/std": 0.4836103618144989,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.10044642857142858,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 29.50361442565918,
|
|
"learning_rate": 6.387014543809223e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 4850985.0,
|
|
"reward": 0.5499999523162842,
|
|
"reward_std": 0.16675156354904175,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.10267857142857142,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 14.329254150390625,
|
|
"learning_rate": 6.227427435703995e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 4946729.0,
|
|
"reward": 0.5187499523162842,
|
|
"reward_std": 0.0578637532889843,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.10491071428571429,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.066498153718734e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5027633.0,
|
|
"reward": 0.6749999523162842,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.625,
|
|
"rewards/mcq_exact_match_reward/std": 0.48795005679130554,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.10714285714285714,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.90440267166055e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5123361.0,
|
|
"reward": 0.5499999523162842,
|
|
"reward_std": 0.0,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.5,
|
|
"rewards/mcq_exact_match_reward/std": 0.5039526224136353,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 6.0,
|
|
"completions/max_terminated_length": 6.0,
|
|
"completions/mean_length": 6.0,
|
|
"completions/mean_terminated_length": 6.0,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.109375,
|
|
"frac_reward_zero_std": 0.875,
|
|
"grad_norm": 15.217447280883789,
|
|
"learning_rate": 5.741318238559209e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 5215425.0,
|
|
"reward": 0.5187499523162842,
|
|
"reward_std": 0.0578637532889843,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.46875,
|
|
"rewards/mcq_exact_match_reward/std": 0.5029674172401428,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 418.0,
|
|
"completions/max_terminated_length": 418.0,
|
|
"completions/mean_length": 12.4375,
|
|
"completions/mean_terminated_length": 12.4375,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.11160714285714286,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 29.34973907470703,
|
|
"learning_rate": 5.577423184847931e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 5331149.0,
|
|
"reward": 0.26874998211860657,
|
|
"reward_std": 0.1462520956993103,
|
|
"rewards/format_reward/mean": 0.5,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/mcq_exact_match_reward/mean": 0.21875,
|
|
"rewards/mcq_exact_match_reward/std": 0.4166666865348816,
|
|
"step": 50
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 100,
|
|
"num_input_tokens_seen": 5331149,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|