3290 lines
135 KiB
JSON
3290 lines
135 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 93,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 20.59375,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.1623296411707997,
|
|
"epoch": 0.010752688172043012,
|
|
"frac_reward_zero_std": 0.5625,
|
|
"grad_norm": 2.5920114517211914,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.0655,
|
|
"num_tokens": 209228.0,
|
|
"reward": 0.10260416567325592,
|
|
"reward_std": 0.12079741060733795,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.01875000074505806,
|
|
"rewards/reward_coverage/std": 0.08886408805847168,
|
|
"rewards/reward_repetition/mean": 0.08385416865348816,
|
|
"rewards/reward_repetition/std": 0.1609596610069275,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5810753107070923,
|
|
"sampling/importance_sampling_ratio/min": 8.851574766937428e-15,
|
|
"sampling/sampling_logp_difference/max": 32.35818099975586,
|
|
"sampling/sampling_logp_difference/mean": 3.2135589122772217,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 19.34375,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.2037830371409655,
|
|
"epoch": 0.021505376344086023,
|
|
"frac_reward_zero_std": 0.65625,
|
|
"grad_norm": 1.522868275642395,
|
|
"learning_rate": 2.5000000000000004e-07,
|
|
"loss": -0.0672,
|
|
"num_tokens": 411382.0,
|
|
"reward": 0.06171875074505806,
|
|
"reward_std": 0.08728349208831787,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0015625000232830644,
|
|
"rewards/reward_coverage/std": 0.01250000111758709,
|
|
"rewards/reward_repetition/mean": 0.06015624850988388,
|
|
"rewards/reward_repetition/std": 0.14286737143993378,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.572687029838562,
|
|
"sampling/importance_sampling_ratio/min": 6.514152938275704e-16,
|
|
"sampling/sampling_logp_difference/max": 34.967384338378906,
|
|
"sampling/sampling_logp_difference/mean": 3.2888572216033936,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 18.015625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.1908840090036392,
|
|
"epoch": 0.03225806451612903,
|
|
"frac_reward_zero_std": 0.78125,
|
|
"grad_norm": 2.1042428016662598,
|
|
"learning_rate": 5.000000000000001e-07,
|
|
"loss": -0.0415,
|
|
"num_tokens": 589899.0,
|
|
"reward": 0.03333333134651184,
|
|
"reward_std": 0.03609190881252289,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0,
|
|
"rewards/reward_coverage/std": 0.0,
|
|
"rewards/reward_repetition/mean": 0.03333333134651184,
|
|
"rewards/reward_repetition/std": 0.09172075986862183,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.582237184047699,
|
|
"sampling/importance_sampling_ratio/min": 3.4790040246974845e-15,
|
|
"sampling/sampling_logp_difference/max": 33.292030334472656,
|
|
"sampling/sampling_logp_difference/mean": 3.1163713932037354,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 18.1875,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.188672631047666,
|
|
"epoch": 0.043010752688172046,
|
|
"frac_reward_zero_std": 0.78125,
|
|
"grad_norm": 1.6742660999298096,
|
|
"learning_rate": 7.5e-07,
|
|
"loss": -0.0256,
|
|
"num_tokens": 784993.0,
|
|
"reward": 0.03932292014360428,
|
|
"reward_std": 0.046772170811891556,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0,
|
|
"rewards/reward_coverage/std": 0.0,
|
|
"rewards/reward_repetition/mean": 0.03932292014360428,
|
|
"rewards/reward_repetition/std": 0.10880006849765778,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5760836005210876,
|
|
"sampling/importance_sampling_ratio/min": 4.4242851705367583e-14,
|
|
"sampling/sampling_logp_difference/max": 30.749082565307617,
|
|
"sampling/sampling_logp_difference/mean": 3.1024224758148193,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 19.296875,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.1812530495226383,
|
|
"epoch": 0.053763440860215055,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 3.190187931060791,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"loss": -0.061,
|
|
"num_tokens": 994168.0,
|
|
"reward": 0.07604166865348816,
|
|
"reward_std": 0.10753916203975677,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0062500000931322575,
|
|
"rewards/reward_coverage/std": 0.035073623061180115,
|
|
"rewards/reward_repetition/mean": 0.06979166716337204,
|
|
"rewards/reward_repetition/std": 0.1535550206899643,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5558198690414429,
|
|
"sampling/importance_sampling_ratio/min": 1.0327031483188718e-17,
|
|
"sampling/sampling_logp_difference/max": 39.11176681518555,
|
|
"sampling/sampling_logp_difference/mean": 3.2322075366973877,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 18.40625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.1822001421824098,
|
|
"epoch": 0.06451612903225806,
|
|
"frac_reward_zero_std": 0.71875,
|
|
"grad_norm": 2.3724122047424316,
|
|
"learning_rate": 1.25e-06,
|
|
"loss": -0.0569,
|
|
"num_tokens": 1177412.0,
|
|
"reward": 0.04401041567325592,
|
|
"reward_std": 0.062240131199359894,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0,
|
|
"rewards/reward_coverage/std": 0.0,
|
|
"rewards/reward_repetition/mean": 0.04401041567325592,
|
|
"rewards/reward_repetition/std": 0.11309215426445007,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5901535749435425,
|
|
"sampling/importance_sampling_ratio/min": 2.3622067538522137e-15,
|
|
"sampling/sampling_logp_difference/max": 33.67918014526367,
|
|
"sampling/sampling_logp_difference/mean": 3.0934648513793945,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 20.203125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.1859856164082885,
|
|
"epoch": 0.07526881720430108,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 2.2745258808135986,
|
|
"learning_rate": 1.5e-06,
|
|
"loss": -0.07,
|
|
"num_tokens": 1379577.0,
|
|
"reward": 0.08828125149011612,
|
|
"reward_std": 0.10496117174625397,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.02031249925494194,
|
|
"rewards/reward_coverage/std": 0.08578246831893921,
|
|
"rewards/reward_repetition/mean": 0.06796875596046448,
|
|
"rewards/reward_repetition/std": 0.13983187079429626,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5829952359199524,
|
|
"sampling/importance_sampling_ratio/min": 1.7831107242271977e-19,
|
|
"sampling/sampling_logp_difference/max": 43.17075729370117,
|
|
"sampling/sampling_logp_difference/mean": 3.094505786895752,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 19.765625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.1645366102457047,
|
|
"epoch": 0.08602150537634409,
|
|
"frac_reward_zero_std": 0.71875,
|
|
"grad_norm": 2.285911798477173,
|
|
"learning_rate": 1.75e-06,
|
|
"loss": -0.0463,
|
|
"num_tokens": 1562860.0,
|
|
"reward": 0.06822916865348816,
|
|
"reward_std": 0.05671586096286774,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0,
|
|
"rewards/reward_coverage/std": 0.0,
|
|
"rewards/reward_repetition/mean": 0.06822916865348816,
|
|
"rewards/reward_repetition/std": 0.13447654247283936,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.6059508323669434,
|
|
"sampling/importance_sampling_ratio/min": 1.5984405334448e-15,
|
|
"sampling/sampling_logp_difference/max": 34.06974792480469,
|
|
"sampling/sampling_logp_difference/mean": 3.0470666885375977,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 19.75,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.2150460854172707,
|
|
"epoch": 0.0967741935483871,
|
|
"frac_reward_zero_std": 0.5625,
|
|
"grad_norm": 2.4473977088928223,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": -0.0729,
|
|
"num_tokens": 1750970.0,
|
|
"reward": 0.07604166865348816,
|
|
"reward_std": 0.10753916203975677,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.004687500186264515,
|
|
"rewards/reward_coverage/std": 0.03750000149011612,
|
|
"rewards/reward_repetition/mean": 0.07135416567325592,
|
|
"rewards/reward_repetition/std": 0.14673006534576416,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.564116358757019,
|
|
"sampling/importance_sampling_ratio/min": 5.549738260316673e-19,
|
|
"sampling/sampling_logp_difference/max": 42.03536605834961,
|
|
"sampling/sampling_logp_difference/mean": 3.24617075920105,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 20.125,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 1.1741499826312065,
|
|
"epoch": 0.10752688172043011,
|
|
"frac_reward_zero_std": 0.5625,
|
|
"grad_norm": 2.0894999504089355,
|
|
"learning_rate": 2.25e-06,
|
|
"loss": -0.0295,
|
|
"num_tokens": 1955314.0,
|
|
"reward": 0.08229167759418488,
|
|
"reward_std": 0.0766032412648201,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0062500000931322575,
|
|
"rewards/reward_coverage/std": 0.05000000447034836,
|
|
"rewards/reward_repetition/mean": 0.07604166865348816,
|
|
"rewards/reward_repetition/std": 0.12633328139781952,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5676860809326172,
|
|
"sampling/importance_sampling_ratio/min": 1.0015881904035955e-15,
|
|
"sampling/sampling_logp_difference/max": 34.53718948364258,
|
|
"sampling/sampling_logp_difference/mean": 3.318382978439331,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 20.375,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.19177012052387,
|
|
"epoch": 0.11827956989247312,
|
|
"frac_reward_zero_std": 0.625,
|
|
"grad_norm": 2.127878189086914,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": -0.044,
|
|
"num_tokens": 2152810.0,
|
|
"reward": 0.08697916567325592,
|
|
"reward_std": 0.09207119792699814,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0,
|
|
"rewards/reward_coverage/std": 0.0,
|
|
"rewards/reward_repetition/mean": 0.08697916567325592,
|
|
"rewards/reward_repetition/std": 0.1668112874031067,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5731910467147827,
|
|
"sampling/importance_sampling_ratio/min": 5.365229931202564e-15,
|
|
"sampling/sampling_logp_difference/max": 32.85883712768555,
|
|
"sampling/sampling_logp_difference/mean": 3.2824923992156982,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 20.640625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 1.2268892796710134,
|
|
"epoch": 0.12903225806451613,
|
|
"frac_reward_zero_std": 0.53125,
|
|
"grad_norm": 2.617131233215332,
|
|
"learning_rate": 2.7500000000000004e-06,
|
|
"loss": -0.0849,
|
|
"num_tokens": 2354987.0,
|
|
"reward": 0.10390624403953552,
|
|
"reward_std": 0.1211656928062439,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0078125,
|
|
"rewards/reward_coverage/std": 0.0625,
|
|
"rewards/reward_repetition/mean": 0.09609375149011612,
|
|
"rewards/reward_repetition/std": 0.16450294852256775,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5544115900993347,
|
|
"sampling/importance_sampling_ratio/min": 7.363439390216395e-18,
|
|
"sampling/sampling_logp_difference/max": 39.45000457763672,
|
|
"sampling/sampling_logp_difference/mean": 3.3240010738372803,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 34.0,
|
|
"completions/mean_length": 23.421875,
|
|
"completions/mean_terminated_length": 34.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 34.0,
|
|
"entropy": 1.1650395467877388,
|
|
"epoch": 0.13978494623655913,
|
|
"frac_reward_zero_std": 0.375,
|
|
"grad_norm": 2.670605182647705,
|
|
"learning_rate": 3e-06,
|
|
"loss": -0.0758,
|
|
"num_tokens": 2616268.0,
|
|
"reward": 0.18046876788139343,
|
|
"reward_std": 0.17567184567451477,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.04218750074505806,
|
|
"rewards/reward_coverage/std": 0.13190266489982605,
|
|
"rewards/reward_repetition/mean": 0.13828124105930328,
|
|
"rewards/reward_repetition/std": 0.18624858558177948,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5682023167610168,
|
|
"sampling/importance_sampling_ratio/min": 1.0665693960122588e-19,
|
|
"sampling/sampling_logp_difference/max": 43.684669494628906,
|
|
"sampling/sampling_logp_difference/mean": 3.6000094413757324,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.96875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 34.0,
|
|
"completions/mean_length": 27.984375,
|
|
"completions/mean_terminated_length": 34.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 34.0,
|
|
"entropy": 1.111030412837863,
|
|
"epoch": 0.15053763440860216,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 2.47416615486145,
|
|
"learning_rate": 3.2500000000000002e-06,
|
|
"loss": -0.0662,
|
|
"num_tokens": 2906261.0,
|
|
"reward": 0.2278645932674408,
|
|
"reward_std": 0.17272555828094482,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.03593749925494194,
|
|
"rewards/reward_coverage/std": 0.09489709138870239,
|
|
"rewards/reward_repetition/mean": 0.19192710518836975,
|
|
"rewards/reward_repetition/std": 0.19924886524677277,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.540465772151947,
|
|
"sampling/importance_sampling_ratio/min": 4.225438084884613e-17,
|
|
"sampling/sampling_logp_difference/max": 37.702823638916016,
|
|
"sampling/sampling_logp_difference/mean": 3.798464298248291,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.96875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 28.203125,
|
|
"completions/mean_terminated_length": 35.5,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 34.0,
|
|
"entropy": 1.0863127624616027,
|
|
"epoch": 0.16129032258064516,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 3.092238664627075,
|
|
"learning_rate": 3.5e-06,
|
|
"loss": -0.0628,
|
|
"num_tokens": 3202938.0,
|
|
"reward": 0.30156248807907104,
|
|
"reward_std": 0.19813722372055054,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.0390625,
|
|
"rewards/reward_coverage/std": 0.10483121871948242,
|
|
"rewards/reward_repetition/mean": 0.26249998807907104,
|
|
"rewards/reward_repetition/std": 0.20803949236869812,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5526077151298523,
|
|
"sampling/importance_sampling_ratio/min": 3.7098329921141575e-20,
|
|
"sampling/sampling_logp_difference/max": 44.74071502685547,
|
|
"sampling/sampling_logp_difference/mean": 3.7845804691314697,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.953125,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 30.90625,
|
|
"completions/mean_terminated_length": 35.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 34.0,
|
|
"entropy": 1.0002785623073578,
|
|
"epoch": 0.17204301075268819,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 2.3450825214385986,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"loss": -0.0623,
|
|
"num_tokens": 3523934.0,
|
|
"reward": 0.41588544845581055,
|
|
"reward_std": 0.2868938446044922,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.08281250298023224,
|
|
"rewards/reward_coverage/std": 0.15384459495544434,
|
|
"rewards/reward_repetition/mean": 0.3330729007720947,
|
|
"rewards/reward_repetition/std": 0.2281644642353058,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.544209361076355,
|
|
"sampling/importance_sampling_ratio/min": 1.4981450562263129e-16,
|
|
"sampling/sampling_logp_difference/max": 36.4371337890625,
|
|
"sampling/sampling_logp_difference/mean": 4.021841526031494,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.90625,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 33.53125,
|
|
"completions/mean_terminated_length": 34.333335876464844,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 33.0,
|
|
"entropy": 0.8482576478272676,
|
|
"epoch": 0.1827956989247312,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 1.6775325536727905,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": -0.0187,
|
|
"num_tokens": 3875784.0,
|
|
"reward": 0.5098958015441895,
|
|
"reward_std": 0.20402978360652924,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.13437500596046448,
|
|
"rewards/reward_coverage/std": 0.16639859974384308,
|
|
"rewards/reward_repetition/mean": 0.37552082538604736,
|
|
"rewards/reward_repetition/std": 0.17110413312911987,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5547957420349121,
|
|
"sampling/importance_sampling_ratio/min": 1.9580492778949622e-20,
|
|
"sampling/sampling_logp_difference/max": 45.37975311279297,
|
|
"sampling/sampling_logp_difference/mean": 3.881425142288208,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.921875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 35.75,
|
|
"completions/mean_terminated_length": 36.400001525878906,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 34.0,
|
|
"entropy": 0.7285963352769613,
|
|
"epoch": 0.1935483870967742,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.9497219324111938,
|
|
"learning_rate": 4.25e-06,
|
|
"loss": -0.0222,
|
|
"num_tokens": 4230450.0,
|
|
"reward": 0.6015625596046448,
|
|
"reward_std": 0.17456699907779694,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.15000000596046448,
|
|
"rewards/reward_coverage/std": 0.18856181204319,
|
|
"rewards/reward_repetition/mean": 0.4515624940395355,
|
|
"rewards/reward_repetition/std": 0.16296739876270294,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5840170383453369,
|
|
"sampling/importance_sampling_ratio/min": 1.3230608049300072e-14,
|
|
"sampling/sampling_logp_difference/max": 31.95624351501465,
|
|
"sampling/sampling_logp_difference/mean": 3.7303450107574463,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.90625,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 35.5,
|
|
"completions/mean_terminated_length": 35.333335876464844,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 30.0,
|
|
"entropy": 0.7492767116054893,
|
|
"epoch": 0.20430107526881722,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 2.027407169342041,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": -0.0178,
|
|
"num_tokens": 4592424.0,
|
|
"reward": 0.5973958373069763,
|
|
"reward_std": 0.2113954722881317,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.13124999403953552,
|
|
"rewards/reward_coverage/std": 0.1670234352350235,
|
|
"rewards/reward_repetition/mean": 0.4661458432674408,
|
|
"rewards/reward_repetition/std": 0.17673227190971375,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.5936897993087769,
|
|
"sampling/importance_sampling_ratio/min": 3.848576383998589e-20,
|
|
"sampling/sampling_logp_difference/max": 44.70399856567383,
|
|
"sampling/sampling_logp_difference/mean": 3.7467870712280273,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.859375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 37.25,
|
|
"completions/mean_terminated_length": 36.33333206176758,
|
|
"completions/min_length": 24.0,
|
|
"completions/min_terminated_length": 34.0,
|
|
"entropy": 0.6496950350701809,
|
|
"epoch": 0.21505376344086022,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 1.7340210676193237,
|
|
"learning_rate": 4.75e-06,
|
|
"loss": -0.0028,
|
|
"num_tokens": 4973854.0,
|
|
"reward": 0.6968749761581421,
|
|
"reward_std": 0.1944543719291687,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.1875,
|
|
"rewards/reward_coverage/std": 0.17320507764816284,
|
|
"rewards/reward_repetition/mean": 0.5093749761581421,
|
|
"rewards/reward_repetition/std": 0.14407385885715485,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.6289246678352356,
|
|
"sampling/importance_sampling_ratio/min": 1.4430105039911333e-19,
|
|
"sampling/sampling_logp_difference/max": 43.38238525390625,
|
|
"sampling/sampling_logp_difference/mean": 3.522400379180908,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.9375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 37.96875,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 16.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 0.6573502826504409,
|
|
"epoch": 0.22580645161290322,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.9576199054718018,
|
|
"learning_rate": 5e-06,
|
|
"loss": -0.0085,
|
|
"num_tokens": 5357700.0,
|
|
"reward": 0.715624988079071,
|
|
"reward_std": 0.1944543719291687,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.17499999701976776,
|
|
"rewards/reward_coverage/std": 0.1736626923084259,
|
|
"rewards/reward_repetition/mean": 0.5406249761581421,
|
|
"rewards/reward_repetition/std": 0.1649615317583084,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.666456401348114,
|
|
"sampling/importance_sampling_ratio/min": 1.55370423422568e-14,
|
|
"sampling/sampling_logp_difference/max": 31.795549392700195,
|
|
"sampling/sampling_logp_difference/mean": 3.181442975997925,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.859375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 38.859375,
|
|
"completions/mean_terminated_length": 36.66666793823242,
|
|
"completions/min_length": 34.0,
|
|
"completions/min_terminated_length": 34.0,
|
|
"entropy": 0.5784921627491713,
|
|
"epoch": 0.23655913978494625,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 1.6491539478302002,
|
|
"learning_rate": 4.931506849315069e-06,
|
|
"loss": 0.009,
|
|
"num_tokens": 5746525.0,
|
|
"reward": 0.792187511920929,
|
|
"reward_std": 0.1834058165550232,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156249403953552,
|
|
"rewards/reward_coverage/std": 0.17772118747234344,
|
|
"rewards/reward_repetition/mean": 0.5906250476837158,
|
|
"rewards/reward_repetition/std": 0.1399759203195572,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.6919029355049133,
|
|
"sampling/importance_sampling_ratio/min": 2.5034810497841535e-16,
|
|
"sampling/sampling_logp_difference/max": 35.92367935180664,
|
|
"sampling/sampling_logp_difference/mean": 3.11592435836792,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.96875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.03125,
|
|
"completions/mean_terminated_length": 35.5,
|
|
"completions/min_length": 32.0,
|
|
"completions/min_terminated_length": 34.0,
|
|
"entropy": 0.5693019391037524,
|
|
"epoch": 0.24731182795698925,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.9411438703536987,
|
|
"learning_rate": 4.863013698630138e-06,
|
|
"loss": -0.0017,
|
|
"num_tokens": 6135539.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.19887377321720123,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156249403953552,
|
|
"rewards/reward_coverage/std": 0.16855332255363464,
|
|
"rewards/reward_repetition/mean": 0.6578124761581421,
|
|
"rewards/reward_repetition/std": 0.1950211226940155,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.6962894201278687,
|
|
"sampling/importance_sampling_ratio/min": 5.379221697881343e-17,
|
|
"sampling/sampling_logp_difference/max": 37.461402893066406,
|
|
"sampling/sampling_logp_difference/mean": 2.938572645187378,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.921875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.109375,
|
|
"completions/mean_terminated_length": 35.20000076293945,
|
|
"completions/min_length": 33.0,
|
|
"completions/min_terminated_length": 33.0,
|
|
"entropy": 0.521631199400872,
|
|
"epoch": 0.25806451612903225,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 2.231412172317505,
|
|
"learning_rate": 4.7945205479452054e-06,
|
|
"loss": -0.0117,
|
|
"num_tokens": 6524620.0,
|
|
"reward": 0.862500011920929,
|
|
"reward_std": 0.20329320430755615,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18437500298023224,
|
|
"rewards/reward_coverage/std": 0.1801399439573288,
|
|
"rewards/reward_repetition/mean": 0.6781250238418579,
|
|
"rewards/reward_repetition/std": 0.1656816154718399,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7251441478729248,
|
|
"sampling/importance_sampling_ratio/min": 1.906062915100565e-20,
|
|
"sampling/sampling_logp_difference/max": 45.40666198730469,
|
|
"sampling/sampling_logp_difference/mean": 2.8243846893310547,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.96875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.484375,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.46869752556085587,
|
|
"epoch": 0.26881720430107525,
|
|
"frac_reward_zero_std": 0.0625,
|
|
"grad_norm": 1.5307422876358032,
|
|
"learning_rate": 4.726027397260274e-06,
|
|
"loss": -0.009,
|
|
"num_tokens": 6914737.0,
|
|
"reward": 0.9578125476837158,
|
|
"reward_std": 0.19224464893341064,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.2109375,
|
|
"rewards/reward_coverage/std": 0.16914087533950806,
|
|
"rewards/reward_repetition/mean": 0.7468750476837158,
|
|
"rewards/reward_repetition/std": 0.1603258103132248,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7427772879600525,
|
|
"sampling/importance_sampling_ratio/min": 6.379089188001887e-19,
|
|
"sampling/sampling_logp_difference/max": 41.89609146118164,
|
|
"sampling/sampling_logp_difference/mean": 2.6809170246124268,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.953125,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.53125,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 0.44641283014789224,
|
|
"epoch": 0.27956989247311825,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 1.65163254737854,
|
|
"learning_rate": 4.657534246575343e-06,
|
|
"loss": -0.0002,
|
|
"num_tokens": 7305211.0,
|
|
"reward": 0.9937499761581421,
|
|
"reward_std": 0.18119609355926514,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.171875,
|
|
"rewards/reward_coverage/std": 0.15272004902362823,
|
|
"rewards/reward_repetition/mean": 0.8218749761581421,
|
|
"rewards/reward_repetition/std": 0.15580691397190094,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.742205023765564,
|
|
"sampling/importance_sampling_ratio/min": 1.740283318765294e-20,
|
|
"sampling/sampling_logp_difference/max": 45.49765396118164,
|
|
"sampling/sampling_logp_difference/mean": 2.5815634727478027,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.5625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.39991177897900343,
|
|
"epoch": 0.2903225806451613,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 2.0949409008026123,
|
|
"learning_rate": 4.589041095890411e-06,
|
|
"loss": -0.0053,
|
|
"num_tokens": 7695441.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.2121320217847824,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.16874998807907104,
|
|
"rewards/reward_coverage/std": 0.1780627816915512,
|
|
"rewards/reward_repetition/mean": 0.831250011920929,
|
|
"rewards/reward_repetition/std": 0.14786845445632935,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7544648051261902,
|
|
"sampling/importance_sampling_ratio/min": 6.676384620995922e-19,
|
|
"sampling/sampling_logp_difference/max": 41.85054016113281,
|
|
"sampling/sampling_logp_difference/mean": 2.4634246826171875,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.953125,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.5625,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 0.3994480683468282,
|
|
"epoch": 0.3010752688172043,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 1.35282301902771,
|
|
"learning_rate": 4.52054794520548e-06,
|
|
"loss": -0.0062,
|
|
"num_tokens": 8086007.0,
|
|
"reward": 1.015625,
|
|
"reward_std": 0.20329320430755615,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.21250000596046448,
|
|
"rewards/reward_coverage/std": 0.1685606688261032,
|
|
"rewards/reward_repetition/mean": 0.8031250238418579,
|
|
"rewards/reward_repetition/std": 0.15732762217521667,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7630195617675781,
|
|
"sampling/importance_sampling_ratio/min": 6.418834864883299e-17,
|
|
"sampling/sampling_logp_difference/max": 37.28470993041992,
|
|
"sampling/sampling_logp_difference/mean": 2.45412540435791,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.96875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.609375,
|
|
"completions/mean_terminated_length": 36.5,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.33232213323935866,
|
|
"epoch": 0.3118279569892473,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 1.4301323890686035,
|
|
"learning_rate": 4.4520547945205486e-06,
|
|
"loss": -0.0026,
|
|
"num_tokens": 8476412.0,
|
|
"reward": 1.0406250953674316,
|
|
"reward_std": 0.15026018023490906,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.21562501788139343,
|
|
"rewards/reward_coverage/std": 0.14498905837535858,
|
|
"rewards/reward_repetition/mean": 0.824999988079071,
|
|
"rewards/reward_repetition/std": 0.14474937319755554,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7893605828285217,
|
|
"sampling/importance_sampling_ratio/min": 6.775734351848958e-14,
|
|
"sampling/sampling_logp_difference/max": 30.322843551635742,
|
|
"sampling/sampling_logp_difference/mean": 2.41098690032959,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.65625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.31409848271869123,
|
|
"epoch": 0.3225806451612903,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.2845097780227661,
|
|
"learning_rate": 4.383561643835616e-06,
|
|
"loss": -0.01,
|
|
"num_tokens": 8866724.0,
|
|
"reward": 1.0109374523162842,
|
|
"reward_std": 0.1480504870414734,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18593749403953552,
|
|
"rewards/reward_coverage/std": 0.13076962530612946,
|
|
"rewards/reward_repetition/mean": 0.824999988079071,
|
|
"rewards/reward_repetition/std": 0.14907118678092957,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7840973734855652,
|
|
"sampling/importance_sampling_ratio/min": 1.471832771813246e-16,
|
|
"sampling/sampling_logp_difference/max": 36.45485305786133,
|
|
"sampling/sampling_logp_difference/mean": 2.4330239295959473,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.796875,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 33.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.3091448312625289,
|
|
"epoch": 0.3333333333333333,
|
|
"frac_reward_zero_std": 0.0625,
|
|
"grad_norm": 1.2038891315460205,
|
|
"learning_rate": 4.315068493150685e-06,
|
|
"loss": -0.0041,
|
|
"num_tokens": 9257223.0,
|
|
"reward": 1.017187476158142,
|
|
"reward_std": 0.20550289750099182,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20468750596046448,
|
|
"rewards/reward_coverage/std": 0.1803257167339325,
|
|
"rewards/reward_repetition/mean": 0.8125,
|
|
"rewards/reward_repetition/std": 0.1374368518590927,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7869740128517151,
|
|
"sampling/importance_sampling_ratio/min": 8.298585072157721e-15,
|
|
"sampling/sampling_logp_difference/max": 32.422691345214844,
|
|
"sampling/sampling_logp_difference/mean": 2.4976119995117188,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.890625,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.2876437115482986,
|
|
"epoch": 0.34408602150537637,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.9871541261672974,
|
|
"learning_rate": 4.246575342465754e-06,
|
|
"loss": 0.0037,
|
|
"num_tokens": 9647784.0,
|
|
"reward": 1.032812476158142,
|
|
"reward_std": 0.17456698417663574,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18281251192092896,
|
|
"rewards/reward_coverage/std": 0.15384458005428314,
|
|
"rewards/reward_repetition/mean": 0.8500000238418579,
|
|
"rewards/reward_repetition/std": 0.1380131095647812,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7916483283042908,
|
|
"sampling/importance_sampling_ratio/min": 1.612893541327095e-14,
|
|
"sampling/sampling_logp_difference/max": 31.758161544799805,
|
|
"sampling/sampling_logp_difference/mean": 2.4329771995544434,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.796875,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.27647654921747744,
|
|
"epoch": 0.3548387096774194,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 1.4954354763031006,
|
|
"learning_rate": 4.178082191780822e-06,
|
|
"loss": -0.0059,
|
|
"num_tokens": 10038289.0,
|
|
"reward": 1.0218749046325684,
|
|
"reward_std": 0.18119610846042633,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20937499403953552,
|
|
"rewards/reward_coverage/std": 0.1687665432691574,
|
|
"rewards/reward_repetition/mean": 0.8125,
|
|
"rewards/reward_repetition/std": 0.12279806286096573,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7887318730354309,
|
|
"sampling/importance_sampling_ratio/min": 8.831320397643036e-19,
|
|
"sampling/sampling_logp_difference/max": 41.5708122253418,
|
|
"sampling/sampling_logp_difference/mean": 2.5059313774108887,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.84375,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.27092319959774613,
|
|
"epoch": 0.3655913978494624,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 1.6384416818618774,
|
|
"learning_rate": 4.109589041095891e-06,
|
|
"loss": -0.0034,
|
|
"num_tokens": 10428797.0,
|
|
"reward": 1.0593750476837158,
|
|
"reward_std": 0.17235726118087769,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.23125001788139343,
|
|
"rewards/reward_coverage/std": 0.15314172208309174,
|
|
"rewards/reward_repetition/mean": 0.828125,
|
|
"rewards/reward_repetition/std": 0.12782520055770874,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7671718597412109,
|
|
"sampling/importance_sampling_ratio/min": 6.987482079498287e-19,
|
|
"sampling/sampling_logp_difference/max": 41.804996490478516,
|
|
"sampling/sampling_logp_difference/mean": 2.6257989406585693,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.8125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2952587741892785,
|
|
"epoch": 0.3763440860215054,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.0413151979446411,
|
|
"learning_rate": 4.0410958904109595e-06,
|
|
"loss": -0.0099,
|
|
"num_tokens": 10819301.0,
|
|
"reward": 0.9296875596046448,
|
|
"reward_std": 0.13921163976192474,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.16093750298023224,
|
|
"rewards/reward_coverage/std": 0.12550494074821472,
|
|
"rewards/reward_repetition/mean": 0.768750011920929,
|
|
"rewards/reward_repetition/std": 0.11391307413578033,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7573365569114685,
|
|
"sampling/importance_sampling_ratio/min": 1.5061544475743168e-18,
|
|
"sampling/sampling_logp_difference/max": 41.03697204589844,
|
|
"sampling/sampling_logp_difference/mean": 2.6861050128936768,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 0.27068308740854263,
|
|
"epoch": 0.3870967741935484,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 1.277216911315918,
|
|
"learning_rate": 3.972602739726027e-06,
|
|
"loss": -0.0004,
|
|
"num_tokens": 11209548.0,
|
|
"reward": 0.989062488079071,
|
|
"reward_std": 0.17014756798744202,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.21406251192092896,
|
|
"rewards/reward_coverage/std": 0.17262418568134308,
|
|
"rewards/reward_repetition/mean": 0.7749999761581421,
|
|
"rewards/reward_repetition/std": 0.1154700517654419,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7654402256011963,
|
|
"sampling/importance_sampling_ratio/min": 4.38677482468737e-14,
|
|
"sampling/sampling_logp_difference/max": 30.757596969604492,
|
|
"sampling/sampling_logp_difference/mean": 2.659276008605957,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.796875,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.2760939297731966,
|
|
"epoch": 0.3978494623655914,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 1.504520297050476,
|
|
"learning_rate": 3.904109589041096e-06,
|
|
"loss": -0.0042,
|
|
"num_tokens": 11599953.0,
|
|
"reward": 0.9546874761581421,
|
|
"reward_std": 0.15246990323066711,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.17656250298023224,
|
|
"rewards/reward_coverage/std": 0.1318274438381195,
|
|
"rewards/reward_repetition/mean": 0.7781250476837158,
|
|
"rewards/reward_repetition/std": 0.12404395639896393,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7589582800865173,
|
|
"sampling/importance_sampling_ratio/min": 1.2950150141564556e-22,
|
|
"sampling/sampling_logp_difference/max": 50.39834976196289,
|
|
"sampling/sampling_logp_difference/mean": 2.7161073684692383,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 40.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 40.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.27736608777195215,
|
|
"epoch": 0.40860215053763443,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.6770557165145874,
|
|
"learning_rate": 3.8356164383561645e-06,
|
|
"loss": -0.0018,
|
|
"num_tokens": 11990363.0,
|
|
"reward": 0.926562488079071,
|
|
"reward_std": 0.14363107085227966,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.17031250894069672,
|
|
"rewards/reward_coverage/std": 0.15293914079666138,
|
|
"rewards/reward_repetition/mean": 0.7562500238418579,
|
|
"rewards/reward_repetition/std": 0.1152980849146843,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7690252661705017,
|
|
"sampling/importance_sampling_ratio/min": 5.327883295646056e-16,
|
|
"sampling/sampling_logp_difference/max": 35.16840744018555,
|
|
"sampling/sampling_logp_difference/mean": 2.6216495037078857,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.8125,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 0.30535601382143795,
|
|
"epoch": 0.41935483870967744,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 1.455296277999878,
|
|
"learning_rate": 3.767123287671233e-06,
|
|
"loss": -0.0019,
|
|
"num_tokens": 12380581.0,
|
|
"reward": 0.948437511920929,
|
|
"reward_std": 0.2010834813117981,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.1796875,
|
|
"rewards/reward_coverage/std": 0.16825877130031586,
|
|
"rewards/reward_repetition/mean": 0.7687499523162842,
|
|
"rewards/reward_repetition/std": 0.12456272542476654,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7560758590698242,
|
|
"sampling/importance_sampling_ratio/min": 1.0004220113236836e-18,
|
|
"sampling/sampling_logp_difference/max": 41.446109771728516,
|
|
"sampling/sampling_logp_difference/mean": 2.5334110260009766,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.90625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.30565810902044177,
|
|
"epoch": 0.43010752688172044,
|
|
"frac_reward_zero_std": 0.03125,
|
|
"grad_norm": 1.2029423713684082,
|
|
"learning_rate": 3.6986301369863014e-06,
|
|
"loss": -0.0015,
|
|
"num_tokens": 12770797.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.19003495573997498,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19062501192092896,
|
|
"rewards/reward_coverage/std": 0.15504096448421478,
|
|
"rewards/reward_repetition/mean": 0.778124988079071,
|
|
"rewards/reward_repetition/std": 0.12404395639896393,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7597452402114868,
|
|
"sampling/importance_sampling_ratio/min": 1.379195585862747e-12,
|
|
"sampling/sampling_logp_difference/max": 27.309520721435547,
|
|
"sampling/sampling_logp_difference/mean": 2.4547033309936523,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 33.0,
|
|
"completions/mean_length": 39.6875,
|
|
"completions/mean_terminated_length": 33.0,
|
|
"completions/min_length": 33.0,
|
|
"completions/min_terminated_length": 33.0,
|
|
"entropy": 0.31394060072489083,
|
|
"epoch": 0.44086021505376344,
|
|
"frac_reward_zero_std": 0.0625,
|
|
"grad_norm": 1.7512142658233643,
|
|
"learning_rate": 3.6301369863013704e-06,
|
|
"loss": 0.0003,
|
|
"num_tokens": 13161191.0,
|
|
"reward": 1.009374976158142,
|
|
"reward_std": 0.18561550974845886,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.22499999403953552,
|
|
"rewards/reward_coverage/std": 0.15936382114887238,
|
|
"rewards/reward_repetition/mean": 0.7843749523162842,
|
|
"rewards/reward_repetition/std": 0.13940775394439697,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7532055974006653,
|
|
"sampling/importance_sampling_ratio/min": 3.4107995156513595e-17,
|
|
"sampling/sampling_logp_difference/max": 37.91699981689453,
|
|
"sampling/sampling_logp_difference/mean": 2.571798086166382,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 40.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 40.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.301532520679757,
|
|
"epoch": 0.45161290322580644,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.6585807800292969,
|
|
"learning_rate": 3.5616438356164386e-06,
|
|
"loss": -0.0042,
|
|
"num_tokens": 13551781.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.13258251547813416,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19375000894069672,
|
|
"rewards/reward_coverage/std": 0.1562202423810959,
|
|
"rewards/reward_repetition/mean": 0.7593749761581421,
|
|
"rewards/reward_repetition/std": 0.10796640068292618,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7576152682304382,
|
|
"sampling/importance_sampling_ratio/min": 4.25440330295826e-18,
|
|
"sampling/sampling_logp_difference/max": 39.99857711791992,
|
|
"sampling/sampling_logp_difference/mean": 2.5987966060638428,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2994292816147208,
|
|
"epoch": 0.46236559139784944,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.1511462926864624,
|
|
"learning_rate": 3.4931506849315072e-06,
|
|
"loss": -0.0023,
|
|
"num_tokens": 13941920.0,
|
|
"reward": 0.9578125476837158,
|
|
"reward_std": 0.12595339119434357,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19218748807907104,
|
|
"rewards/reward_coverage/std": 0.15045401453971863,
|
|
"rewards/reward_repetition/mean": 0.765625,
|
|
"rewards/reward_repetition/std": 0.10422617197036743,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7622615694999695,
|
|
"sampling/importance_sampling_ratio/min": 1.1524427466063367e-15,
|
|
"sampling/sampling_logp_difference/max": 34.39689254760742,
|
|
"sampling/sampling_logp_difference/mean": 2.527230978012085,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.9375,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.2939116738270968,
|
|
"epoch": 0.4731182795698925,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 1.2386677265167236,
|
|
"learning_rate": 3.4246575342465754e-06,
|
|
"loss": -0.0099,
|
|
"num_tokens": 14332340.0,
|
|
"reward": 0.9515625238418579,
|
|
"reward_std": 0.1480504721403122,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19218750298023224,
|
|
"rewards/reward_coverage/std": 0.15461647510528564,
|
|
"rewards/reward_repetition/mean": 0.7593749761581421,
|
|
"rewards/reward_repetition/std": 0.12436345219612122,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7619537711143494,
|
|
"sampling/importance_sampling_ratio/min": 3.6812737413604546e-19,
|
|
"sampling/sampling_logp_difference/max": 42.445858001708984,
|
|
"sampling/sampling_logp_difference/mean": 2.5976765155792236,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.890625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2781213163398206,
|
|
"epoch": 0.4838709677419355,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.170350193977356,
|
|
"learning_rate": 3.356164383561644e-06,
|
|
"loss": -0.0036,
|
|
"num_tokens": 14722931.0,
|
|
"reward": 0.9953124523162842,
|
|
"reward_std": 0.12153396755456924,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.1953125,
|
|
"rewards/reward_coverage/std": 0.14412261545658112,
|
|
"rewards/reward_repetition/mean": 0.800000011920929,
|
|
"rewards/reward_repetition/std": 0.08728715777397156,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7608794569969177,
|
|
"sampling/importance_sampling_ratio/min": 3.331248850987206e-13,
|
|
"sampling/sampling_logp_difference/max": 28.73025894165039,
|
|
"sampling/sampling_logp_difference/mean": 2.6023521423339844,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.84375,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.27238480327650905,
|
|
"epoch": 0.4946236559139785,
|
|
"frac_reward_zero_std": 0.03125,
|
|
"grad_norm": 1.1889655590057373,
|
|
"learning_rate": 3.2876712328767123e-06,
|
|
"loss": -0.005,
|
|
"num_tokens": 15113515.0,
|
|
"reward": 0.9437500238418579,
|
|
"reward_std": 0.17235726118087769,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19062499701976776,
|
|
"rewards/reward_coverage/std": 0.14333748817443848,
|
|
"rewards/reward_repetition/mean": 0.7531249523162842,
|
|
"rewards/reward_repetition/std": 0.09915315359830856,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7437648177146912,
|
|
"sampling/importance_sampling_ratio/min": 3.385970521172965e-19,
|
|
"sampling/sampling_logp_difference/max": 42.529476165771484,
|
|
"sampling/sampling_logp_difference/mean": 2.778402328491211,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.890625,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.24875370506197214,
|
|
"epoch": 0.5053763440860215,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.1314892768859863,
|
|
"learning_rate": 3.2191780821917813e-06,
|
|
"loss": -0.0031,
|
|
"num_tokens": 15503912.0,
|
|
"reward": 0.942187488079071,
|
|
"reward_std": 0.15688931941986084,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18906250596046448,
|
|
"rewards/reward_coverage/std": 0.12230224162340164,
|
|
"rewards/reward_repetition/mean": 0.7531249523162842,
|
|
"rewards/reward_repetition/std": 0.12210943549871445,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7534346580505371,
|
|
"sampling/importance_sampling_ratio/min": 3.298192560128319e-15,
|
|
"sampling/sampling_logp_difference/max": 33.345401763916016,
|
|
"sampling/sampling_logp_difference/mean": 2.7750205993652344,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.96875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.875,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.24461835296824574,
|
|
"epoch": 0.5161290322580645,
|
|
"frac_reward_zero_std": 0.0625,
|
|
"grad_norm": 0.6554945111274719,
|
|
"learning_rate": 3.1506849315068495e-06,
|
|
"loss": -0.0048,
|
|
"num_tokens": 15894138.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.16793785989284515,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.21875,
|
|
"rewards/reward_coverage/std": 0.13554710149765015,
|
|
"rewards/reward_repetition/mean": 0.734375,
|
|
"rewards/reward_repetition/std": 0.12372364103794098,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7367614507675171,
|
|
"sampling/importance_sampling_ratio/min": 1.2913128950274503e-18,
|
|
"sampling/sampling_logp_difference/max": 41.19087219238281,
|
|
"sampling/sampling_logp_difference/mean": 2.8969175815582275,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.221066806698218,
|
|
"epoch": 0.5268817204301075,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.813700795173645,
|
|
"learning_rate": 3.082191780821918e-06,
|
|
"loss": -0.0125,
|
|
"num_tokens": 16284827.0,
|
|
"reward": 0.9203125238418579,
|
|
"reward_std": 0.1657281517982483,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18906250596046448,
|
|
"rewards/reward_coverage/std": 0.1310727298259735,
|
|
"rewards/reward_repetition/mean": 0.7312500476837158,
|
|
"rewards/reward_repetition/std": 0.12456272542476654,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7454463839530945,
|
|
"sampling/importance_sampling_ratio/min": 2.6984808100466543e-19,
|
|
"sampling/sampling_logp_difference/max": 42.75642776489258,
|
|
"sampling/sampling_logp_difference/mean": 3.0947470664978027,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 40.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 40.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2340390719473362,
|
|
"epoch": 0.5376344086021505,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.1709717512130737,
|
|
"learning_rate": 3.0136986301369864e-06,
|
|
"loss": -0.0098,
|
|
"num_tokens": 16675257.0,
|
|
"reward": 0.9140625596046448,
|
|
"reward_std": 0.13479222357273102,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18906250596046448,
|
|
"rewards/reward_coverage/std": 0.14155283570289612,
|
|
"rewards/reward_repetition/mean": 0.7250000238418579,
|
|
"rewards/reward_repetition/std": 0.10983392596244812,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.732937216758728,
|
|
"sampling/importance_sampling_ratio/min": 1.6756041496787032e-16,
|
|
"sampling/sampling_logp_difference/max": 36.32518768310547,
|
|
"sampling/sampling_logp_difference/mean": 3.183100700378418,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 39.0,
|
|
"completions/mean_length": 39.78125,
|
|
"completions/mean_terminated_length": 39.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 39.0,
|
|
"entropy": 0.20713584939949214,
|
|
"epoch": 0.5483870967741935,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.8335843086242676,
|
|
"learning_rate": 2.945205479452055e-06,
|
|
"loss": -0.0031,
|
|
"num_tokens": 17065845.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.15026019513607025,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.21875,
|
|
"rewards/reward_coverage/std": 0.1390158236026764,
|
|
"rewards/reward_repetition/mean": 0.734375,
|
|
"rewards/reward_repetition/std": 0.10722880065441132,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.739495038986206,
|
|
"sampling/importance_sampling_ratio/min": 4.677705163318169e-13,
|
|
"sampling/sampling_logp_difference/max": 28.390798568725586,
|
|
"sampling/sampling_logp_difference/mean": 3.198160409927368,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.9375,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.19506761734373868,
|
|
"epoch": 0.5591397849462365,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 0.6900437474250793,
|
|
"learning_rate": 2.876712328767123e-06,
|
|
"loss": -0.0002,
|
|
"num_tokens": 17456339.0,
|
|
"reward": 0.9359375238418579,
|
|
"reward_std": 0.17898640036582947,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20468750596046448,
|
|
"rewards/reward_coverage/std": 0.17129728198051453,
|
|
"rewards/reward_repetition/mean": 0.731249988079071,
|
|
"rewards/reward_repetition/std": 0.09574270248413086,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7362239956855774,
|
|
"sampling/importance_sampling_ratio/min": 2.602479520623457e-19,
|
|
"sampling/sampling_logp_difference/max": 42.79265213012695,
|
|
"sampling/sampling_logp_difference/mean": 3.2387518882751465,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 40.0,
|
|
"completions/mean_length": 40.0,
|
|
"completions/mean_terminated_length": 40.0,
|
|
"completions/min_length": 40.0,
|
|
"completions/min_terminated_length": 40.0,
|
|
"entropy": 0.1951053044758737,
|
|
"epoch": 0.5698924731182796,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.0901892185211182,
|
|
"learning_rate": 2.8082191780821922e-06,
|
|
"loss": -0.0096,
|
|
"num_tokens": 17846929.0,
|
|
"reward": 0.9390625357627869,
|
|
"reward_std": 0.19224466383457184,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.2109375,
|
|
"rewards/reward_coverage/std": 0.17193317413330078,
|
|
"rewards/reward_repetition/mean": 0.7281249761581421,
|
|
"rewards/reward_repetition/std": 0.10307764261960983,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7550839781761169,
|
|
"sampling/importance_sampling_ratio/min": 1.1955911409525077e-18,
|
|
"sampling/sampling_logp_difference/max": 41.26789093017578,
|
|
"sampling/sampling_logp_difference/mean": 3.1413958072662354,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.765625,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 0.21364939608611166,
|
|
"epoch": 0.5806451612903226,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.0280330181121826,
|
|
"learning_rate": 2.7397260273972604e-06,
|
|
"loss": -0.0093,
|
|
"num_tokens": 18237620.0,
|
|
"reward": 0.9500000476837158,
|
|
"reward_std": 0.1414213478565216,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19062501192092896,
|
|
"rewards/reward_coverage/std": 0.1540137678384781,
|
|
"rewards/reward_repetition/mean": 0.7593749761581421,
|
|
"rewards/reward_repetition/std": 0.0885845422744751,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.738981306552887,
|
|
"sampling/importance_sampling_ratio/min": 4.2308257591772147e-13,
|
|
"sampling/sampling_logp_difference/max": 28.491209030151367,
|
|
"sampling/sampling_logp_difference/mean": 3.156113624572754,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.890625,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.24034091946668923,
|
|
"epoch": 0.5913978494623656,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 1.18429696559906,
|
|
"learning_rate": 2.671232876712329e-06,
|
|
"loss": -0.0049,
|
|
"num_tokens": 18628129.0,
|
|
"reward": 0.8953125476837158,
|
|
"reward_std": 0.1657281517982483,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.1796875,
|
|
"rewards/reward_coverage/std": 0.1299324631690979,
|
|
"rewards/reward_repetition/mean": 0.715624988079071,
|
|
"rewards/reward_repetition/std": 0.11158134788274765,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7346891760826111,
|
|
"sampling/importance_sampling_ratio/min": 1.511831024952892e-13,
|
|
"sampling/sampling_logp_difference/max": 29.52028465270996,
|
|
"sampling/sampling_logp_difference/mean": 3.2208282947540283,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 33.0,
|
|
"completions/mean_length": 39.84375,
|
|
"completions/mean_terminated_length": 33.0,
|
|
"completions/min_length": 33.0,
|
|
"completions/min_terminated_length": 33.0,
|
|
"entropy": 0.23257427848875523,
|
|
"epoch": 0.6021505376344086,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.6966381072998047,
|
|
"learning_rate": 2.6027397260273973e-06,
|
|
"loss": -0.0088,
|
|
"num_tokens": 19018723.0,
|
|
"reward": 0.9343750476837158,
|
|
"reward_std": 0.15026018023490906,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19687500596046448,
|
|
"rewards/reward_coverage/std": 0.1284446120262146,
|
|
"rewards/reward_repetition/mean": 0.737500011920929,
|
|
"rewards/reward_repetition/std": 0.106159508228302,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7426254153251648,
|
|
"sampling/importance_sampling_ratio/min": 7.579855932368998e-14,
|
|
"sampling/sampling_logp_difference/max": 30.210697174072266,
|
|
"sampling/sampling_logp_difference/mean": 3.221386194229126,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2281794489827007,
|
|
"epoch": 0.6129032258064516,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 0.7310183048248291,
|
|
"learning_rate": 2.534246575342466e-06,
|
|
"loss": -0.0083,
|
|
"num_tokens": 19409232.0,
|
|
"reward": 0.9671875238418579,
|
|
"reward_std": 0.14363107085227966,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.22968751192092896,
|
|
"rewards/reward_coverage/std": 0.14979319274425507,
|
|
"rewards/reward_repetition/mean": 0.737500011920929,
|
|
"rewards/reward_repetition/std": 0.10000000149011612,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7459241151809692,
|
|
"sampling/importance_sampling_ratio/min": 1.3522516009469616e-19,
|
|
"sampling/sampling_logp_difference/max": 43.44734573364258,
|
|
"sampling/sampling_logp_difference/mean": 3.226292133331299,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.24348380486480892,
|
|
"epoch": 0.6236559139784946,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.9412605166435242,
|
|
"learning_rate": 2.4657534246575345e-06,
|
|
"loss": -0.0063,
|
|
"num_tokens": 19799925.0,
|
|
"reward": 0.948437511920929,
|
|
"reward_std": 0.15246990323066711,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156249403953552,
|
|
"rewards/reward_coverage/std": 0.14637655019760132,
|
|
"rewards/reward_repetition/mean": 0.746874988079071,
|
|
"rewards/reward_repetition/std": 0.10833332687616348,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7302582263946533,
|
|
"sampling/importance_sampling_ratio/min": 1.1974077329752507e-18,
|
|
"sampling/sampling_logp_difference/max": 41.26637268066406,
|
|
"sampling/sampling_logp_difference/mean": 3.3912689685821533,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.22413912834599614,
|
|
"epoch": 0.6344086021505376,
|
|
"frac_reward_zero_std": 0.28125,
|
|
"grad_norm": 0.6372097730636597,
|
|
"learning_rate": 2.3972602739726027e-06,
|
|
"loss": -0.0078,
|
|
"num_tokens": 20190710.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.1875,
|
|
"rewards/reward_coverage/std": 0.1278640329837799,
|
|
"rewards/reward_repetition/mean": 0.765625,
|
|
"rewards/reward_repetition/std": 0.08398554474115372,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7377102971076965,
|
|
"sampling/importance_sampling_ratio/min": 5.862870793152246e-21,
|
|
"sampling/sampling_logp_difference/max": 46.58564758300781,
|
|
"sampling/sampling_logp_difference/mean": 3.426024913787842,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.90625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.23593324795365334,
|
|
"epoch": 0.6451612903225806,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.7991491556167603,
|
|
"learning_rate": 2.3287671232876713e-06,
|
|
"loss": -0.0066,
|
|
"num_tokens": 20581308.0,
|
|
"reward": 0.9515625238418579,
|
|
"reward_std": 0.13921165466308594,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18906250596046448,
|
|
"rewards/reward_coverage/std": 0.12487097084522247,
|
|
"rewards/reward_repetition/mean": 0.7625000476837158,
|
|
"rewards/reward_repetition/std": 0.11751393228769302,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7420527935028076,
|
|
"sampling/importance_sampling_ratio/min": 1.627108762957747e-23,
|
|
"sampling/sampling_logp_difference/max": 52.472652435302734,
|
|
"sampling/sampling_logp_difference/mean": 3.326465606689453,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.9375,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.25969044235534966,
|
|
"epoch": 0.6559139784946236,
|
|
"frac_reward_zero_std": 0.03125,
|
|
"grad_norm": 0.9682433009147644,
|
|
"learning_rate": 2.26027397260274e-06,
|
|
"loss": -0.0109,
|
|
"num_tokens": 20972092.0,
|
|
"reward": 0.9828125238418579,
|
|
"reward_std": 0.15688931941986084,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20781250298023224,
|
|
"rewards/reward_coverage/std": 0.1336955726146698,
|
|
"rewards/reward_repetition/mean": 0.7749999761581421,
|
|
"rewards/reward_repetition/std": 0.09759000688791275,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7352744936943054,
|
|
"sampling/importance_sampling_ratio/min": 4.3607164320474956e-13,
|
|
"sampling/sampling_logp_difference/max": 28.460969924926758,
|
|
"sampling/sampling_logp_difference/mean": 3.418292999267578,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.23248756467364728,
|
|
"epoch": 0.6666666666666666,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.9272934198379517,
|
|
"learning_rate": 2.191780821917808e-06,
|
|
"loss": -0.0034,
|
|
"num_tokens": 21362873.0,
|
|
"reward": 0.9390624761581421,
|
|
"reward_std": 0.13921163976192474,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18593750894069672,
|
|
"rewards/reward_coverage/std": 0.11800886690616608,
|
|
"rewards/reward_repetition/mean": 0.7531249523162842,
|
|
"rewards/reward_repetition/std": 0.1053621917963028,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7381278276443481,
|
|
"sampling/importance_sampling_ratio/min": 5.011335584784865e-14,
|
|
"sampling/sampling_logp_difference/max": 30.624488830566406,
|
|
"sampling/sampling_logp_difference/mean": 3.4408388137817383,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.890625,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 0.2272115428932011,
|
|
"epoch": 0.6774193548387096,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.6223781704902649,
|
|
"learning_rate": 2.123287671232877e-06,
|
|
"loss": -0.0014,
|
|
"num_tokens": 21753644.0,
|
|
"reward": 0.9937499761581421,
|
|
"reward_std": 0.16793787479400635,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20625001192092896,
|
|
"rewards/reward_coverage/std": 0.15210169553756714,
|
|
"rewards/reward_repetition/mean": 0.7875000238418579,
|
|
"rewards/reward_repetition/std": 0.08637312799692154,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7413582801818848,
|
|
"sampling/importance_sampling_ratio/min": 8.111444921513807e-17,
|
|
"sampling/sampling_logp_difference/max": 37.0506706237793,
|
|
"sampling/sampling_logp_difference/mean": 3.607243776321411,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2311963385436684,
|
|
"epoch": 0.6881720430107527,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 0.8859496116638184,
|
|
"learning_rate": 2.0547945205479454e-06,
|
|
"loss": 0.0046,
|
|
"num_tokens": 22144251.0,
|
|
"reward": 1.0171875953674316,
|
|
"reward_std": 0.15688931941986084,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.2421875,
|
|
"rewards/reward_coverage/std": 0.16407963633537292,
|
|
"rewards/reward_repetition/mean": 0.7749999761581421,
|
|
"rewards/reward_repetition/std": 0.09085135161876678,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7465033531188965,
|
|
"sampling/importance_sampling_ratio/min": 4.7572778301925226e-18,
|
|
"sampling/sampling_logp_difference/max": 39.88685607910156,
|
|
"sampling/sampling_logp_difference/mean": 3.552140951156616,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2426956002600491,
|
|
"epoch": 0.6989247311827957,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 1.269731044769287,
|
|
"learning_rate": 1.9863013698630136e-06,
|
|
"loss": -0.0092,
|
|
"num_tokens": 22535042.0,
|
|
"reward": 0.9437500238418579,
|
|
"reward_std": 0.16793784499168396,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.16562500596046448,
|
|
"rewards/reward_coverage/std": 0.1382644772529602,
|
|
"rewards/reward_repetition/mean": 0.778124988079071,
|
|
"rewards/reward_repetition/std": 0.10759823769330978,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.745509684085846,
|
|
"sampling/importance_sampling_ratio/min": 2.5291580594867795e-16,
|
|
"sampling/sampling_logp_difference/max": 35.913475036621094,
|
|
"sampling/sampling_logp_difference/mean": 3.580662488937378,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 40.0,
|
|
"completions/mean_length": 39.90625,
|
|
"completions/mean_terminated_length": 40.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 40.0,
|
|
"entropy": 0.25972409872338176,
|
|
"epoch": 0.7096774193548387,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 1.2360777854919434,
|
|
"learning_rate": 1.9178082191780823e-06,
|
|
"loss": -0.0021,
|
|
"num_tokens": 22925826.0,
|
|
"reward": 0.989062488079071,
|
|
"reward_std": 0.13921163976192474,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156249403953552,
|
|
"rewards/reward_coverage/std": 0.1578548550605774,
|
|
"rewards/reward_repetition/mean": 0.7875000238418579,
|
|
"rewards/reward_repetition/std": 0.07867958396673203,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.742576003074646,
|
|
"sampling/importance_sampling_ratio/min": 3.8009925881678924e-15,
|
|
"sampling/sampling_logp_difference/max": 33.203514099121094,
|
|
"sampling/sampling_logp_difference/mean": 3.5737202167510986,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.9375,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.24233098467811942,
|
|
"epoch": 0.7204301075268817,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 1.0064667463302612,
|
|
"learning_rate": 1.8493150684931507e-06,
|
|
"loss": -0.0074,
|
|
"num_tokens": 23316620.0,
|
|
"reward": 0.9796874523162842,
|
|
"reward_std": 0.16130872070789337,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18281251192092896,
|
|
"rewards/reward_coverage/std": 0.13280214369297028,
|
|
"rewards/reward_repetition/mean": 0.796875,
|
|
"rewards/reward_repetition/std": 0.11542708426713943,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7610887885093689,
|
|
"sampling/importance_sampling_ratio/min": 3.512002747491507e-17,
|
|
"sampling/sampling_logp_difference/max": 37.887760162353516,
|
|
"sampling/sampling_logp_difference/mean": 3.572136878967285,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.26319174305535853,
|
|
"epoch": 0.7311827956989247,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.0560470819473267,
|
|
"learning_rate": 1.7808219178082193e-06,
|
|
"loss": -0.012,
|
|
"num_tokens": 23707121.0,
|
|
"reward": 0.9937500953674316,
|
|
"reward_std": 0.16351842880249023,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.21562500298023224,
|
|
"rewards/reward_coverage/std": 0.13359349966049194,
|
|
"rewards/reward_repetition/mean": 0.778124988079071,
|
|
"rewards/reward_repetition/std": 0.12404395639896393,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7442783713340759,
|
|
"sampling/importance_sampling_ratio/min": 1.375699243920652e-15,
|
|
"sampling/sampling_logp_difference/max": 34.21981430053711,
|
|
"sampling/sampling_logp_difference/mean": 3.5864293575286865,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.96875,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.78125,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.27109498833306134,
|
|
"epoch": 0.7419354838709677,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 1.1270716190338135,
|
|
"learning_rate": 1.7123287671232877e-06,
|
|
"loss": -0.0065,
|
|
"num_tokens": 24097617.0,
|
|
"reward": 0.9968750476837158,
|
|
"reward_std": 0.1237436830997467,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20937500894069672,
|
|
"rewards/reward_coverage/std": 0.1376892626285553,
|
|
"rewards/reward_repetition/mean": 0.7875000238418579,
|
|
"rewards/reward_repetition/std": 0.09343531727790833,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7473452687263489,
|
|
"sampling/importance_sampling_ratio/min": 1.4495503789956396e-16,
|
|
"sampling/sampling_logp_difference/max": 36.47010803222656,
|
|
"sampling/sampling_logp_difference/mean": 3.4904568195343018,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.90625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.26768106454983354,
|
|
"epoch": 0.7526881720430108,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 1.1696867942810059,
|
|
"learning_rate": 1.6438356164383561e-06,
|
|
"loss": -0.0057,
|
|
"num_tokens": 24488387.0,
|
|
"reward": 1.0328125953674316,
|
|
"reward_std": 0.13921163976192474,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.22031250596046448,
|
|
"rewards/reward_coverage/std": 0.13590343296527863,
|
|
"rewards/reward_repetition/mean": 0.8125,
|
|
"rewards/reward_repetition/std": 0.11198072135448456,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7499436736106873,
|
|
"sampling/importance_sampling_ratio/min": 1.4622693992618306e-15,
|
|
"sampling/sampling_logp_difference/max": 34.15878677368164,
|
|
"sampling/sampling_logp_difference/mean": 3.6728289127349854,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.24969360628165305,
|
|
"epoch": 0.7634408602150538,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.8783805966377258,
|
|
"learning_rate": 1.5753424657534248e-06,
|
|
"loss": -0.0031,
|
|
"num_tokens": 24879084.0,
|
|
"reward": 1.0343749523162842,
|
|
"reward_std": 0.1590990126132965,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19062501192092896,
|
|
"rewards/reward_coverage/std": 0.13179922103881836,
|
|
"rewards/reward_repetition/mean": 0.8437500596046448,
|
|
"rewards/reward_repetition/std": 0.10965313017368317,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7588081955909729,
|
|
"sampling/importance_sampling_ratio/min": 3.0506860601055286e-14,
|
|
"sampling/sampling_logp_difference/max": 31.120824813842773,
|
|
"sampling/sampling_logp_difference/mean": 3.584549903869629,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.796875,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2575332070700824,
|
|
"epoch": 0.7741935483870968,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 0.6952300071716309,
|
|
"learning_rate": 1.5068493150684932e-06,
|
|
"loss": -0.0028,
|
|
"num_tokens": 25269863.0,
|
|
"reward": 1.029687523841858,
|
|
"reward_std": 0.14363105595111847,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.1953125,
|
|
"rewards/reward_coverage/std": 0.1396477371454239,
|
|
"rewards/reward_repetition/mean": 0.8343750238418579,
|
|
"rewards/reward_repetition/std": 0.11014961451292038,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.767298698425293,
|
|
"sampling/importance_sampling_ratio/min": 1.6540065300593926e-15,
|
|
"sampling/sampling_logp_difference/max": 34.03557586669922,
|
|
"sampling/sampling_logp_difference/mean": 3.5412185192108154,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.796875,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.27034957450814545,
|
|
"epoch": 0.7849462365591398,
|
|
"frac_reward_zero_std": 0.28125,
|
|
"grad_norm": 1.3305058479309082,
|
|
"learning_rate": 1.4383561643835616e-06,
|
|
"loss": -0.0086,
|
|
"num_tokens": 25660624.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.1303728073835373,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19218750298023224,
|
|
"rewards/reward_coverage/std": 0.11724982410669327,
|
|
"rewards/reward_repetition/mean": 0.784375011920929,
|
|
"rewards/reward_repetition/std": 0.10269193351268768,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7455124258995056,
|
|
"sampling/importance_sampling_ratio/min": 4.9776697520764746e-14,
|
|
"sampling/sampling_logp_difference/max": 30.631229400634766,
|
|
"sampling/sampling_logp_difference/mean": 3.557706594467163,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 40.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 40.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 40.0,
|
|
"entropy": 0.2672195213381201,
|
|
"epoch": 0.7956989247311828,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 1.368238925933838,
|
|
"learning_rate": 1.3698630136986302e-06,
|
|
"loss": -0.0091,
|
|
"num_tokens": 26051389.0,
|
|
"reward": 1.0171875953674316,
|
|
"reward_std": 0.15246988832950592,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.1953125,
|
|
"rewards/reward_coverage/std": 0.1361951231956482,
|
|
"rewards/reward_repetition/mean": 0.8218749761581421,
|
|
"rewards/reward_repetition/std": 0.10759823024272919,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7550181746482849,
|
|
"sampling/importance_sampling_ratio/min": 4.837896576403988e-13,
|
|
"sampling/sampling_logp_difference/max": 28.357126235961914,
|
|
"sampling/sampling_logp_difference/mean": 3.518388271331787,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2512192933354527,
|
|
"epoch": 0.8064516129032258,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 0.8075931072235107,
|
|
"learning_rate": 1.3013698630136986e-06,
|
|
"loss": -0.0055,
|
|
"num_tokens": 26442164.0,
|
|
"reward": 0.9906250238418579,
|
|
"reward_std": 0.12816309928894043,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.16562500596046448,
|
|
"rewards/reward_coverage/std": 0.12626346945762634,
|
|
"rewards/reward_repetition/mean": 0.8250000476837158,
|
|
"rewards/reward_repetition/std": 0.09085134416818619,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7599254250526428,
|
|
"sampling/importance_sampling_ratio/min": 1.1269083539586222e-12,
|
|
"sampling/sampling_logp_difference/max": 27.51154327392578,
|
|
"sampling/sampling_logp_difference/mean": 3.5641071796417236,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.90625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.25094706076197326,
|
|
"epoch": 0.8172043010752689,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.8058338165283203,
|
|
"learning_rate": 1.2328767123287673e-06,
|
|
"loss": -0.008,
|
|
"num_tokens": 26832860.0,
|
|
"reward": 1.0093750953674316,
|
|
"reward_std": 0.1767766773700714,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19374999403953552,
|
|
"rewards/reward_coverage/std": 0.16122055053710938,
|
|
"rewards/reward_repetition/mean": 0.815625011920929,
|
|
"rewards/reward_repetition/std": 0.10269193351268768,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7578780055046082,
|
|
"sampling/importance_sampling_ratio/min": 1.1848823085411635e-15,
|
|
"sampling/sampling_logp_difference/max": 34.36913299560547,
|
|
"sampling/sampling_logp_difference/mean": 3.489642858505249,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.90625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.26383460965007544,
|
|
"epoch": 0.8279569892473119,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 0.9045315384864807,
|
|
"learning_rate": 1.1643835616438357e-06,
|
|
"loss": -0.0039,
|
|
"num_tokens": 27223636.0,
|
|
"reward": 1.032812476158142,
|
|
"reward_std": 0.14363105595111847,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156249403953552,
|
|
"rewards/reward_coverage/std": 0.12407395988702774,
|
|
"rewards/reward_repetition/mean": 0.831250011920929,
|
|
"rewards/reward_repetition/std": 0.12456272542476654,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7593458890914917,
|
|
"sampling/importance_sampling_ratio/min": 1.2729455923859382e-14,
|
|
"sampling/sampling_logp_difference/max": 31.994857788085938,
|
|
"sampling/sampling_logp_difference/mean": 3.625725030899048,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.859375,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2579868610482663,
|
|
"epoch": 0.8387096774193549,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 1.0903428792953491,
|
|
"learning_rate": 1.095890410958904e-06,
|
|
"loss": -0.0077,
|
|
"num_tokens": 27614409.0,
|
|
"reward": 1.053125023841858,
|
|
"reward_std": 0.18119609355926514,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.22187501192092896,
|
|
"rewards/reward_coverage/std": 0.15682236850261688,
|
|
"rewards/reward_repetition/mean": 0.8312499523162842,
|
|
"rewards/reward_repetition/std": 0.12456272542476654,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7564480304718018,
|
|
"sampling/importance_sampling_ratio/min": 3.942305127637401e-14,
|
|
"sampling/sampling_logp_difference/max": 30.864425659179688,
|
|
"sampling/sampling_logp_difference/mean": 3.604686975479126,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.24791082250885665,
|
|
"epoch": 0.8494623655913979,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 1.090151071548462,
|
|
"learning_rate": 1.0273972602739727e-06,
|
|
"loss": -0.0028,
|
|
"num_tokens": 28005198.0,
|
|
"reward": 1.029687523841858,
|
|
"reward_std": 0.1480504870414734,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19218750298023224,
|
|
"rewards/reward_coverage/std": 0.16837665438652039,
|
|
"rewards/reward_repetition/mean": 0.8375000357627869,
|
|
"rewards/reward_repetition/std": 0.106159508228302,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.771141767501831,
|
|
"sampling/importance_sampling_ratio/min": 3.6129458528665753e-14,
|
|
"sampling/sampling_logp_difference/max": 30.95166778564453,
|
|
"sampling/sampling_logp_difference/mean": 3.4720849990844727,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.859375,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.25930464873090386,
|
|
"epoch": 0.8602150537634409,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.083723545074463,
|
|
"learning_rate": 9.589041095890411e-07,
|
|
"loss": -0.0051,
|
|
"num_tokens": 28395507.0,
|
|
"reward": 1.046875,
|
|
"reward_std": 0.15026018023490906,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.23125000298023224,
|
|
"rewards/reward_coverage/std": 0.14015299081802368,
|
|
"rewards/reward_repetition/mean": 0.815625011920929,
|
|
"rewards/reward_repetition/std": 0.152459979057312,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7536813020706177,
|
|
"sampling/importance_sampling_ratio/min": 9.19962348487624e-14,
|
|
"sampling/sampling_logp_difference/max": 30.01702880859375,
|
|
"sampling/sampling_logp_difference/mean": 3.546005964279175,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 40.0,
|
|
"completions/mean_length": 40.0,
|
|
"completions/mean_terminated_length": 40.0,
|
|
"completions/min_length": 40.0,
|
|
"completions/min_terminated_length": 40.0,
|
|
"entropy": 0.24061511480249465,
|
|
"epoch": 0.8709677419354839,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.8584389686584473,
|
|
"learning_rate": 8.904109589041097e-07,
|
|
"loss": -0.0066,
|
|
"num_tokens": 28786289.0,
|
|
"reward": 1.0609374046325684,
|
|
"reward_std": 0.1303727924823761,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.2109375,
|
|
"rewards/reward_coverage/std": 0.15130877494812012,
|
|
"rewards/reward_repetition/mean": 0.8500000238418579,
|
|
"rewards/reward_repetition/std": 0.10690449178218842,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7686938643455505,
|
|
"sampling/importance_sampling_ratio/min": 2.277888706651854e-13,
|
|
"sampling/sampling_logp_difference/max": 29.1103572845459,
|
|
"sampling/sampling_logp_difference/mean": 3.516740560531616,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 36.0,
|
|
"completions/mean_length": 39.84375,
|
|
"completions/mean_terminated_length": 36.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"entropy": 0.23985581938177347,
|
|
"epoch": 0.8817204301075269,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 0.9327372312545776,
|
|
"learning_rate": 8.219178082191781e-07,
|
|
"loss": -0.0095,
|
|
"num_tokens": 29177061.0,
|
|
"reward": 1.0640625953674316,
|
|
"reward_std": 0.15688930451869965,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.21718749403953552,
|
|
"rewards/reward_coverage/std": 0.13634072244167328,
|
|
"rewards/reward_repetition/mean": 0.846875011920929,
|
|
"rewards/reward_repetition/std": 0.12210942804813385,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7691041827201843,
|
|
"sampling/importance_sampling_ratio/min": 9.13759844699269e-13,
|
|
"sampling/sampling_logp_difference/max": 27.721208572387695,
|
|
"sampling/sampling_logp_difference/mean": 3.6279892921447754,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 33.0,
|
|
"completions/mean_length": 39.609375,
|
|
"completions/mean_terminated_length": 33.0,
|
|
"completions/min_length": 33.0,
|
|
"completions/min_terminated_length": 33.0,
|
|
"entropy": 0.2540010770317167,
|
|
"epoch": 0.8924731182795699,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7506739497184753,
|
|
"learning_rate": 7.534246575342466e-07,
|
|
"loss": -0.0126,
|
|
"num_tokens": 29567814.0,
|
|
"reward": 1.0328125953674316,
|
|
"reward_std": 0.1834058165550232,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156249403953552,
|
|
"rewards/reward_coverage/std": 0.13857802748680115,
|
|
"rewards/reward_repetition/mean": 0.831250011920929,
|
|
"rewards/reward_repetition/std": 0.12456272542476654,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7523236274719238,
|
|
"sampling/importance_sampling_ratio/min": 5.4132771128059115e-14,
|
|
"sampling/sampling_logp_difference/max": 30.54733657836914,
|
|
"sampling/sampling_logp_difference/mean": 3.681413173675537,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.875,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.25169974751770496,
|
|
"epoch": 0.9032258064516129,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 0.8115216493606567,
|
|
"learning_rate": 6.849315068493151e-07,
|
|
"loss": -0.0139,
|
|
"num_tokens": 29958486.0,
|
|
"reward": 1.0359375476837158,
|
|
"reward_std": 0.13479222357273102,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.18593750894069672,
|
|
"rewards/reward_coverage/std": 0.14014413952827454,
|
|
"rewards/reward_repetition/mean": 0.8500000238418579,
|
|
"rewards/reward_repetition/std": 0.10690449178218842,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.755419909954071,
|
|
"sampling/importance_sampling_ratio/min": 5.566194003652804e-14,
|
|
"sampling/sampling_logp_difference/max": 30.519479751586914,
|
|
"sampling/sampling_logp_difference/mean": 3.5999581813812256,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2387481287587434,
|
|
"epoch": 0.9139784946236559,
|
|
"frac_reward_zero_std": 0.1875,
|
|
"grad_norm": 0.7745798826217651,
|
|
"learning_rate": 6.164383561643836e-07,
|
|
"loss": -0.0112,
|
|
"num_tokens": 30349169.0,
|
|
"reward": 1.0515624284744263,
|
|
"reward_std": 0.1834058165550232,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156250894069672,
|
|
"rewards/reward_coverage/std": 0.15274441242218018,
|
|
"rewards/reward_repetition/mean": 0.8500000238418579,
|
|
"rewards/reward_repetition/std": 0.11818736046552658,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.76320481300354,
|
|
"sampling/importance_sampling_ratio/min": 1.7256138737983123e-13,
|
|
"sampling/sampling_logp_difference/max": 29.388023376464844,
|
|
"sampling/sampling_logp_difference/mean": 3.6501305103302,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.765625,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2610483162570745,
|
|
"epoch": 0.9247311827956989,
|
|
"frac_reward_zero_std": 0.21875,
|
|
"grad_norm": 1.2502846717834473,
|
|
"learning_rate": 5.47945205479452e-07,
|
|
"loss": -0.0119,
|
|
"num_tokens": 30739838.0,
|
|
"reward": 1.0515625476837158,
|
|
"reward_std": 0.12595339119434357,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20781250298023224,
|
|
"rewards/reward_coverage/std": 0.12885908782482147,
|
|
"rewards/reward_repetition/mean": 0.84375,
|
|
"rewards/reward_repetition/std": 0.11529809236526489,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7513002753257751,
|
|
"sampling/importance_sampling_ratio/min": 2.8611465139379705e-14,
|
|
"sampling/sampling_logp_difference/max": 31.184968948364258,
|
|
"sampling/sampling_logp_difference/mean": 3.6847875118255615,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 40.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 40.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.23906523222103715,
|
|
"epoch": 0.9354838709677419,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 0.7462826371192932,
|
|
"learning_rate": 4.794520547945206e-07,
|
|
"loss": -0.0061,
|
|
"num_tokens": 31130530.0,
|
|
"reward": 1.045312523841858,
|
|
"reward_std": 0.1303728073835373,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156250894069672,
|
|
"rewards/reward_coverage/std": 0.13391800224781036,
|
|
"rewards/reward_repetition/mean": 0.84375,
|
|
"rewards/reward_repetition/std": 0.10965313017368317,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7613617777824402,
|
|
"sampling/importance_sampling_ratio/min": 9.466830531296155e-13,
|
|
"sampling/sampling_logp_difference/max": 27.68581199645996,
|
|
"sampling/sampling_logp_difference/mean": 3.6363892555236816,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2408477501012385,
|
|
"epoch": 0.946236559139785,
|
|
"frac_reward_zero_std": 0.125,
|
|
"grad_norm": 1.0979928970336914,
|
|
"learning_rate": 4.1095890410958903e-07,
|
|
"loss": -0.0084,
|
|
"num_tokens": 31521313.0,
|
|
"reward": 1.0375001430511475,
|
|
"reward_std": 0.1590990126132965,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.19062499701976776,
|
|
"rewards/reward_coverage/std": 0.14662203192710876,
|
|
"rewards/reward_repetition/mean": 0.846875011920929,
|
|
"rewards/reward_repetition/std": 0.09915315359830856,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7597459554672241,
|
|
"sampling/importance_sampling_ratio/min": 1.093270764716825e-12,
|
|
"sampling/sampling_logp_difference/max": 27.541847229003906,
|
|
"sampling/sampling_logp_difference/mean": 3.6034116744995117,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.984375,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 37.0,
|
|
"completions/mean_length": 39.859375,
|
|
"completions/mean_terminated_length": 37.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"entropy": 0.254688891582191,
|
|
"epoch": 0.956989247311828,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 0.8742080926895142,
|
|
"learning_rate": 3.4246575342465755e-07,
|
|
"loss": -0.0096,
|
|
"num_tokens": 31911900.0,
|
|
"reward": 1.0859375,
|
|
"reward_std": 0.183405801653862,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.2515625059604645,
|
|
"rewards/reward_coverage/std": 0.16522441804409027,
|
|
"rewards/reward_repetition/mean": 0.8343749642372131,
|
|
"rewards/reward_repetition/std": 0.13119566440582275,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7481080889701843,
|
|
"sampling/importance_sampling_ratio/min": 1.0778268598939867e-16,
|
|
"sampling/sampling_logp_difference/max": 36.766414642333984,
|
|
"sampling/sampling_logp_difference/mean": 3.735830307006836,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.24119682773016393,
|
|
"epoch": 0.967741935483871,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 0.9537080526351929,
|
|
"learning_rate": 2.73972602739726e-07,
|
|
"loss": -0.0102,
|
|
"num_tokens": 32302677.0,
|
|
"reward": 1.078125,
|
|
"reward_std": 0.15026018023490906,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.22812499105930328,
|
|
"rewards/reward_coverage/std": 0.15783129632472992,
|
|
"rewards/reward_repetition/mean": 0.8500000238418579,
|
|
"rewards/reward_repetition/std": 0.11268723756074905,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7631887793540955,
|
|
"sampling/importance_sampling_ratio/min": 4.8462983300891216e-14,
|
|
"sampling/sampling_logp_difference/max": 30.657976150512695,
|
|
"sampling/sampling_logp_difference/mean": 3.6851108074188232,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 40.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 40.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.24475648440420628,
|
|
"epoch": 0.978494623655914,
|
|
"frac_reward_zero_std": 0.0625,
|
|
"grad_norm": 0.890794038772583,
|
|
"learning_rate": 2.0547945205479452e-07,
|
|
"loss": -0.004,
|
|
"num_tokens": 32693369.0,
|
|
"reward": 1.0171875953674316,
|
|
"reward_std": 0.14363105595111847,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.17343750596046448,
|
|
"rewards/reward_coverage/std": 0.13362134993076324,
|
|
"rewards/reward_repetition/mean": 0.84375,
|
|
"rewards/reward_repetition/std": 0.11529809236526489,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7631818652153015,
|
|
"sampling/importance_sampling_ratio/min": 3.372482685910089e-14,
|
|
"sampling/sampling_logp_difference/max": 31.02054214477539,
|
|
"sampling/sampling_logp_difference/mean": 3.65973162651062,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 40.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 40.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2510380311869085,
|
|
"epoch": 0.989247311827957,
|
|
"frac_reward_zero_std": 0.09375,
|
|
"grad_norm": 0.9964897632598877,
|
|
"learning_rate": 1.36986301369863e-07,
|
|
"loss": -0.0126,
|
|
"num_tokens": 33084143.0,
|
|
"reward": 1.0359375476837158,
|
|
"reward_std": 0.17898640036582947,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.1953125,
|
|
"rewards/reward_coverage/std": 0.1385064274072647,
|
|
"rewards/reward_repetition/mean": 0.8406250476837158,
|
|
"rewards/reward_repetition/std": 0.134186252951622,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.753570556640625,
|
|
"sampling/importance_sampling_ratio/min": 6.38058653628466e-15,
|
|
"sampling/sampling_logp_difference/max": 32.685516357421875,
|
|
"sampling/sampling_logp_difference/mean": 3.6861093044281006,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 40.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 39.953125,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"entropy": 0.2551991257350892,
|
|
"epoch": 1.0,
|
|
"frac_reward_zero_std": 0.15625,
|
|
"grad_norm": 1.1012099981307983,
|
|
"learning_rate": 6.84931506849315e-08,
|
|
"loss": -0.0126,
|
|
"num_tokens": 33474818.0,
|
|
"reward": 1.0421874523162842,
|
|
"reward_std": 0.15246988832950592,
|
|
"rewards/reward_correct/mean": 0.0,
|
|
"rewards/reward_correct/std": 0.0,
|
|
"rewards/reward_coverage/mean": 0.20156250894069672,
|
|
"rewards/reward_coverage/std": 0.1290898472070694,
|
|
"rewards/reward_repetition/mean": 0.8406250476837158,
|
|
"rewards/reward_repetition/std": 0.1293681114912033,
|
|
"sampling/importance_sampling_ratio/max": 2.0,
|
|
"sampling/importance_sampling_ratio/mean": 0.7444443702697754,
|
|
"sampling/importance_sampling_ratio/min": 2.801043902031508e-13,
|
|
"sampling/sampling_logp_difference/max": 28.903614044189453,
|
|
"sampling/sampling_logp_difference/mean": 3.683718204498291,
|
|
"step": 93
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 93,
|
|
"num_input_tokens_seen": 33474818,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 10,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|