1955 lines
79 KiB
JSON
1955 lines
79 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.00075,
|
|
"eval_steps": 500,
|
|
"global_step": 75,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10096.0,
|
|
"completions/max_terminated_length": 10096.0,
|
|
"completions/mean_length": 8672.71875,
|
|
"completions/mean_terminated_length": 8672.71875,
|
|
"completions/min_length": 3020.0,
|
|
"completions/min_terminated_length": 3020.0,
|
|
"entropy": 0.49113161116838455,
|
|
"epoch": 1e-05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.241949200630188,
|
|
"kl": 0.0,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.0633,
|
|
"num_tokens": 306152.0,
|
|
"reward": -0.4408680200576782,
|
|
"reward_std": 0.3989785313606262,
|
|
"rewards/rollout_eval_reward_func/mean": 0.11064532399177551,
|
|
"rewards/rollout_eval_reward_func/std": 0.21571724116802216,
|
|
"rewards/rollout_reward_func/mean": -0.4408680200576782,
|
|
"rewards/rollout_reward_func/std": 0.44763946533203125,
|
|
"sampling/importance_sampling_ratio/max": 1.2819759845733643,
|
|
"sampling/importance_sampling_ratio/mean": 0.9992397427558899,
|
|
"sampling/importance_sampling_ratio/min": 0.7715137004852295,
|
|
"sampling/sampling_logp_difference/max": 0.2594008445739746,
|
|
"sampling/sampling_logp_difference/mean": 0.01546277105808258,
|
|
"step": 1,
|
|
"step_time": 73.26994180099973
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"entropy": 0.49113161116838455,
|
|
"epoch": 2e-05,
|
|
"grad_norm": 1.2400784492492676,
|
|
"kl": 0.0,
|
|
"learning_rate": 2.8571428571428573e-06,
|
|
"loss": -0.0633,
|
|
"step": 2,
|
|
"step_time": 30.109230951999052
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0005208333604969084,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10009.0,
|
|
"completions/max_terminated_length": 10009.0,
|
|
"completions/mean_length": 7330.1875,
|
|
"completions/mean_terminated_length": 7330.1875,
|
|
"completions/min_length": 346.0,
|
|
"completions/min_terminated_length": 346.0,
|
|
"entropy": 0.5131296459585428,
|
|
"epoch": 3e-05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.102152943611145,
|
|
"kl": 0.0009028113518070313,
|
|
"learning_rate": 5.7142857142857145e-06,
|
|
"loss": -0.2347,
|
|
"num_tokens": 569740.0,
|
|
"reward": -0.48799318075180054,
|
|
"reward_std": 0.5598607063293457,
|
|
"rewards/rollout_eval_reward_func/mean": 0.22929370403289795,
|
|
"rewards/rollout_eval_reward_func/std": 0.26715749502182007,
|
|
"rewards/rollout_reward_func/mean": -0.48799318075180054,
|
|
"rewards/rollout_reward_func/std": 0.5559459924697876,
|
|
"sampling/importance_sampling_ratio/max": 1.2627520561218262,
|
|
"sampling/importance_sampling_ratio/mean": 1.0006182193756104,
|
|
"sampling/importance_sampling_ratio/min": 0.7627776861190796,
|
|
"sampling/sampling_logp_difference/max": 0.27078866958618164,
|
|
"sampling/sampling_logp_difference/mean": 0.014230873435735703,
|
|
"step": 3,
|
|
"step_time": 68.85090976999709
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0020833334419876337,
|
|
"clip_ratio/high_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0010416667209938169,
|
|
"entropy": 0.5151741988956928,
|
|
"epoch": 4e-05,
|
|
"grad_norm": 1.0848904848098755,
|
|
"kl": 0.0004950130587531021,
|
|
"learning_rate": 8.571428571428573e-06,
|
|
"loss": -0.2336,
|
|
"step": 4,
|
|
"step_time": 28.428488818004553
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0010416667209938169,
|
|
"clip_ratio/high_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0010416667209938169,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10323.0,
|
|
"completions/max_terminated_length": 10323.0,
|
|
"completions/mean_length": 8267.125,
|
|
"completions/mean_terminated_length": 8267.125,
|
|
"completions/min_length": 1640.0,
|
|
"completions/min_terminated_length": 1640.0,
|
|
"entropy": 0.5123504158109426,
|
|
"epoch": 5e-05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1476984024047852,
|
|
"kl": 0.0007431179510604125,
|
|
"learning_rate": 1.1428571428571429e-05,
|
|
"loss": -0.0418,
|
|
"num_tokens": 862728.0,
|
|
"reward": -0.46075016260147095,
|
|
"reward_std": 0.5065791606903076,
|
|
"rewards/rollout_eval_reward_func/mean": 0.128683939576149,
|
|
"rewards/rollout_eval_reward_func/std": 0.2396152913570404,
|
|
"rewards/rollout_reward_func/mean": -0.46075016260147095,
|
|
"rewards/rollout_reward_func/std": 0.5104123950004578,
|
|
"sampling/importance_sampling_ratio/max": 1.3248213529586792,
|
|
"sampling/importance_sampling_ratio/mean": 1.0001360177993774,
|
|
"sampling/importance_sampling_ratio/min": 0.6914317011833191,
|
|
"sampling/sampling_logp_difference/max": 0.3689908981323242,
|
|
"sampling/sampling_logp_difference/mean": 0.016226449981331825,
|
|
"step": 5,
|
|
"step_time": 75.37122915000327
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0026041667442768812,
|
|
"clip_ratio/high_mean": 0.0013020833721384406,
|
|
"clip_ratio/low_mean": 0.0032900729565881193,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.00459215632872656,
|
|
"entropy": 0.5106779877096415,
|
|
"epoch": 6e-05,
|
|
"grad_norm": 1.0145094394683838,
|
|
"kl": 0.0013804795053147245,
|
|
"learning_rate": 1.4285714285714285e-05,
|
|
"loss": -0.045,
|
|
"step": 6,
|
|
"step_time": 29.551835642994774
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0024003623984754086,
|
|
"clip_ratio/high_mean": 0.0012001811992377043,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0017210145597346127,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10088.0,
|
|
"completions/max_terminated_length": 10088.0,
|
|
"completions/mean_length": 8518.21875,
|
|
"completions/mean_terminated_length": 8518.21875,
|
|
"completions/min_length": 4084.0,
|
|
"completions/min_terminated_length": 4084.0,
|
|
"entropy": 0.5038529355078936,
|
|
"epoch": 7e-05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.5022886991500854,
|
|
"kl": 0.002840353590727318,
|
|
"learning_rate": 1.7142857142857145e-05,
|
|
"loss": -0.0036,
|
|
"num_tokens": 1164601.0,
|
|
"reward": -0.41255950927734375,
|
|
"reward_std": 0.46968239545822144,
|
|
"rewards/rollout_eval_reward_func/mean": 0.11216971278190613,
|
|
"rewards/rollout_eval_reward_func/std": 0.2204883098602295,
|
|
"rewards/rollout_reward_func/mean": -0.41255950927734375,
|
|
"rewards/rollout_reward_func/std": 0.5122336149215698,
|
|
"sampling/importance_sampling_ratio/max": 1.4158059358596802,
|
|
"sampling/importance_sampling_ratio/mean": 1.0018370151519775,
|
|
"sampling/importance_sampling_ratio/min": 0.7707551121711731,
|
|
"sampling/sampling_logp_difference/max": 0.3476989269256592,
|
|
"sampling/sampling_logp_difference/mean": 0.017664402723312378,
|
|
"step": 7,
|
|
"step_time": 77.99332059699736
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.005842391517944634,
|
|
"clip_ratio/high_mean": 0.0034420291776768863,
|
|
"clip_ratio/low_mean": 0.0051097974355798215,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.008551826613256708,
|
|
"entropy": 0.5001224614679813,
|
|
"epoch": 8e-05,
|
|
"grad_norm": 1.3377231359481812,
|
|
"kl": 0.006958273006603122,
|
|
"learning_rate": 2e-05,
|
|
"loss": -0.0079,
|
|
"step": 8,
|
|
"step_time": 30.119341139003154
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0020833334419876337,
|
|
"clip_ratio/high_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_mean": 0.00046641789958812296,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0015080846205819398,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9987.0,
|
|
"completions/max_terminated_length": 9987.0,
|
|
"completions/mean_length": 8235.9375,
|
|
"completions/mean_terminated_length": 8235.9375,
|
|
"completions/min_length": 2028.0,
|
|
"completions/min_terminated_length": 2028.0,
|
|
"entropy": 0.5665333420038223,
|
|
"epoch": 9e-05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.413293719291687,
|
|
"kl": 0.012357485480606556,
|
|
"learning_rate": 2.2857142857142858e-05,
|
|
"loss": -0.0089,
|
|
"num_tokens": 1456974.0,
|
|
"reward": -0.2786320447921753,
|
|
"reward_std": 0.4699662923812866,
|
|
"rewards/rollout_eval_reward_func/mean": 0.12322154641151428,
|
|
"rewards/rollout_eval_reward_func/std": 0.23254993557929993,
|
|
"rewards/rollout_reward_func/mean": -0.2786320447921753,
|
|
"rewards/rollout_reward_func/std": 0.510530948638916,
|
|
"sampling/importance_sampling_ratio/max": 1.6322839260101318,
|
|
"sampling/importance_sampling_ratio/mean": 0.9981738328933716,
|
|
"sampling/importance_sampling_ratio/min": 0.6440463662147522,
|
|
"sampling/sampling_logp_difference/max": 0.48998022079467773,
|
|
"sampling/sampling_logp_difference/mean": 0.02640429511666298,
|
|
"step": 9,
|
|
"step_time": 80.34681812299641
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.028179825632832944,
|
|
"clip_ratio/high_mean": 0.01559113833354786,
|
|
"clip_ratio/low_mean": 0.01464278216008097,
|
|
"clip_ratio/low_min": 0.006223290809430182,
|
|
"clip_ratio/region_mean": 0.03023392061004415,
|
|
"entropy": 0.5607042815536261,
|
|
"epoch": 0.0001,
|
|
"grad_norm": 1.2342119216918945,
|
|
"kl": 0.03045007959008217,
|
|
"learning_rate": 2.5714285714285714e-05,
|
|
"loss": -0.0159,
|
|
"step": 10,
|
|
"step_time": 28.650263912999435
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0005208333604969084,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10076.0,
|
|
"completions/max_terminated_length": 10076.0,
|
|
"completions/mean_length": 8311.21875,
|
|
"completions/mean_terminated_length": 8311.21875,
|
|
"completions/min_length": 1530.0,
|
|
"completions/min_terminated_length": 1530.0,
|
|
"entropy": 0.4887528121471405,
|
|
"epoch": 0.00011,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.4407812356948853,
|
|
"kl": 0.04280303395353258,
|
|
"learning_rate": 2.857142857142857e-05,
|
|
"loss": -0.0508,
|
|
"num_tokens": 1751757.0,
|
|
"reward": -0.26280224323272705,
|
|
"reward_std": 0.4824950098991394,
|
|
"rewards/rollout_eval_reward_func/mean": 0.1091209352016449,
|
|
"rewards/rollout_eval_reward_func/std": 0.22141531109809875,
|
|
"rewards/rollout_reward_func/mean": -0.26280224323272705,
|
|
"rewards/rollout_reward_func/std": 0.4825066328048706,
|
|
"sampling/importance_sampling_ratio/max": 2.2060391902923584,
|
|
"sampling/importance_sampling_ratio/mean": 1.003042221069336,
|
|
"sampling/importance_sampling_ratio/min": 0.505047619342804,
|
|
"sampling/sampling_logp_difference/max": 0.79119873046875,
|
|
"sampling/sampling_logp_difference/mean": 0.03998423367738724,
|
|
"step": 11,
|
|
"step_time": 81.20211481799561
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.031166458851657808,
|
|
"clip_ratio/high_mean": 0.01714572956552729,
|
|
"clip_ratio/low_mean": 0.018567851395346224,
|
|
"clip_ratio/low_min": 0.005885701393708587,
|
|
"clip_ratio/region_mean": 0.0357135811354965,
|
|
"entropy": 0.47410433553159237,
|
|
"epoch": 0.00012,
|
|
"grad_norm": 1.048365831375122,
|
|
"kl": 0.08051084214821458,
|
|
"learning_rate": 3.142857142857143e-05,
|
|
"loss": -0.0558,
|
|
"step": 12,
|
|
"step_time": 29.28374841400546
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.001953125,
|
|
"clip_ratio/high_mean": 0.0009765625,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0009765625,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10209.0,
|
|
"completions/max_terminated_length": 10209.0,
|
|
"completions/mean_length": 8161.71875,
|
|
"completions/mean_terminated_length": 8161.71875,
|
|
"completions/min_length": 1827.0,
|
|
"completions/min_terminated_length": 1827.0,
|
|
"entropy": 0.43679925985634327,
|
|
"epoch": 0.00013,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0560696125030518,
|
|
"kl": 0.09263498219661415,
|
|
"learning_rate": 3.428571428571429e-05,
|
|
"loss": 0.1042,
|
|
"num_tokens": 2042327.0,
|
|
"reward": -0.02590048871934414,
|
|
"reward_std": 0.6161512732505798,
|
|
"rewards/rollout_eval_reward_func/mean": 0.16006097197532654,
|
|
"rewards/rollout_eval_reward_func/std": 0.2864827811717987,
|
|
"rewards/rollout_reward_func/mean": -0.02590048871934414,
|
|
"rewards/rollout_reward_func/std": 0.6041470170021057,
|
|
"sampling/importance_sampling_ratio/max": 2.7582640647888184,
|
|
"sampling/importance_sampling_ratio/mean": 0.9981331825256348,
|
|
"sampling/importance_sampling_ratio/min": 0.361401230096817,
|
|
"sampling/sampling_logp_difference/max": 1.0177664756774902,
|
|
"sampling/sampling_logp_difference/mean": 0.06089622899889946,
|
|
"step": 13,
|
|
"step_time": 85.01218143400365
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.012486383900977671,
|
|
"clip_ratio/high_mean": 0.007805692031979561,
|
|
"clip_ratio/low_mean": 0.030729168094694614,
|
|
"clip_ratio/low_min": 0.015625000465661287,
|
|
"clip_ratio/region_mean": 0.038534860184881836,
|
|
"entropy": 0.41658624820411205,
|
|
"epoch": 0.00014,
|
|
"grad_norm": 1.044942855834961,
|
|
"kl": 0.16313170175999403,
|
|
"learning_rate": 3.7142857142857143e-05,
|
|
"loss": 0.1002,
|
|
"step": 14,
|
|
"step_time": 28.990433916003894
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.00596590933855623,
|
|
"clip_ratio/high_mean": 0.0035037880297750235,
|
|
"clip_ratio/low_mean": 0.0005122950533404946,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.004016083083115518,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10134.0,
|
|
"completions/max_terminated_length": 10134.0,
|
|
"completions/mean_length": 8323.34375,
|
|
"completions/mean_terminated_length": 8323.34375,
|
|
"completions/min_length": 1934.0,
|
|
"completions/min_terminated_length": 1934.0,
|
|
"entropy": 0.44160761684179306,
|
|
"epoch": 0.00015,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.2544862031936646,
|
|
"kl": 0.21658248733729124,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.1199,
|
|
"num_tokens": 2337711.0,
|
|
"reward": -0.0776321142911911,
|
|
"reward_std": 0.5812347531318665,
|
|
"rewards/rollout_eval_reward_func/mean": 0.14151422679424286,
|
|
"rewards/rollout_eval_reward_func/std": 0.2538794279098511,
|
|
"rewards/rollout_reward_func/mean": -0.0776321142911911,
|
|
"rewards/rollout_reward_func/std": 0.5845968723297119,
|
|
"sampling/importance_sampling_ratio/max": 1.8725090026855469,
|
|
"sampling/importance_sampling_ratio/mean": 0.9912927150726318,
|
|
"sampling/importance_sampling_ratio/min": 0.1565917581319809,
|
|
"sampling/sampling_logp_difference/max": 1.8541131019592285,
|
|
"sampling/sampling_logp_difference/mean": 0.06762713938951492,
|
|
"step": 15,
|
|
"step_time": 87.63701662399762
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.033285985700786114,
|
|
"clip_ratio/high_mean": 0.02006899402476847,
|
|
"clip_ratio/low_mean": 0.017902423918712884,
|
|
"clip_ratio/low_min": 0.008303140406496823,
|
|
"clip_ratio/region_mean": 0.03797141805989668,
|
|
"entropy": 0.43832515366375446,
|
|
"epoch": 0.00016,
|
|
"grad_norm": 1.1862040758132935,
|
|
"kl": 0.2433762801811099,
|
|
"learning_rate": 4.2857142857142856e-05,
|
|
"loss": 0.1137,
|
|
"step": 16,
|
|
"step_time": 30.26940473900322
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.005208333604969084,
|
|
"clip_ratio/high_mean": 0.002604166802484542,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0031250001629814506,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10099.0,
|
|
"completions/max_terminated_length": 10099.0,
|
|
"completions/mean_length": 8931.625,
|
|
"completions/mean_terminated_length": 8931.625,
|
|
"completions/min_length": 2013.0,
|
|
"completions/min_terminated_length": 2013.0,
|
|
"entropy": 0.4058182891458273,
|
|
"epoch": 0.00017,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.206827998161316,
|
|
"kl": 0.1886005294509232,
|
|
"learning_rate": 4.5714285714285716e-05,
|
|
"loss": -0.0944,
|
|
"num_tokens": 2652765.0,
|
|
"reward": -0.0752231553196907,
|
|
"reward_std": 0.48041343688964844,
|
|
"rewards/rollout_eval_reward_func/mean": 0.10861280560493469,
|
|
"rewards/rollout_eval_reward_func/std": 0.2368263602256775,
|
|
"rewards/rollout_reward_func/mean": -0.0752231553196907,
|
|
"rewards/rollout_reward_func/std": 0.5091694593429565,
|
|
"sampling/importance_sampling_ratio/max": 2.2689177989959717,
|
|
"sampling/importance_sampling_ratio/mean": 1.0046234130859375,
|
|
"sampling/importance_sampling_ratio/min": 0.1846628040075302,
|
|
"sampling/sampling_logp_difference/max": 1.6892237663269043,
|
|
"sampling/sampling_logp_difference/mean": 0.06120520830154419,
|
|
"step": 17,
|
|
"step_time": 96.5394253049999
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0221070961561054,
|
|
"clip_ratio/high_mean": 0.013136881520040333,
|
|
"clip_ratio/low_mean": 0.005389189289417118,
|
|
"clip_ratio/low_min": 0.002066256827674806,
|
|
"clip_ratio/region_mean": 0.01852607080945745,
|
|
"entropy": 0.40752917528152466,
|
|
"epoch": 0.00018,
|
|
"grad_norm": 1.039859652519226,
|
|
"kl": 0.20007089478895068,
|
|
"learning_rate": 4.8571428571428576e-05,
|
|
"loss": -0.1064,
|
|
"step": 18,
|
|
"step_time": 29.607819763001316
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.00424107164144516,
|
|
"clip_ratio/high_mean": 0.00212053582072258,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.00212053582072258,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9868.0,
|
|
"completions/max_terminated_length": 9868.0,
|
|
"completions/mean_length": 7739.625,
|
|
"completions/mean_terminated_length": 7739.625,
|
|
"completions/min_length": 1494.0,
|
|
"completions/min_terminated_length": 1494.0,
|
|
"entropy": 0.3824189379811287,
|
|
"epoch": 0.00019,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1822205781936646,
|
|
"kl": 0.1706448094919324,
|
|
"learning_rate": 5.142857142857143e-05,
|
|
"loss": -0.1187,
|
|
"num_tokens": 2929452.0,
|
|
"reward": 0.1796756088733673,
|
|
"reward_std": 0.6716787815093994,
|
|
"rewards/rollout_eval_reward_func/mean": 0.25978150963783264,
|
|
"rewards/rollout_eval_reward_func/std": 0.31619328260421753,
|
|
"rewards/rollout_reward_func/mean": 0.1796756088733673,
|
|
"rewards/rollout_reward_func/std": 0.6625394821166992,
|
|
"sampling/importance_sampling_ratio/max": 1.8655627965927124,
|
|
"sampling/importance_sampling_ratio/mean": 1.0000479221343994,
|
|
"sampling/importance_sampling_ratio/min": 0.33482789993286133,
|
|
"sampling/sampling_logp_difference/max": 1.0941386222839355,
|
|
"sampling/sampling_logp_difference/mean": 0.04819408059120178,
|
|
"step": 19,
|
|
"step_time": 92.65558583299753
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.030015080701559782,
|
|
"clip_ratio/high_mean": 0.018132540630176663,
|
|
"clip_ratio/low_mean": 0.03180725604761392,
|
|
"clip_ratio/low_min": 0.0052083334885537624,
|
|
"clip_ratio/region_mean": 0.049939796910621226,
|
|
"entropy": 0.3580914381891489,
|
|
"epoch": 0.0002,
|
|
"grad_norm": 1.152976155281067,
|
|
"kl": 0.2634436935186386,
|
|
"learning_rate": 5.428571428571428e-05,
|
|
"loss": -0.1272,
|
|
"step": 20,
|
|
"step_time": 28.27301450500272
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0020833335001952946,
|
|
"clip_ratio/low_min": 0.0010416667209938169,
|
|
"clip_ratio/region_mean": 0.0020833335001952946,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10426.0,
|
|
"completions/max_terminated_length": 10426.0,
|
|
"completions/mean_length": 7911.40625,
|
|
"completions/mean_terminated_length": 7911.40625,
|
|
"completions/min_length": 1040.0,
|
|
"completions/min_terminated_length": 1040.0,
|
|
"entropy": 0.3455618601292372,
|
|
"epoch": 0.00021,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9142677187919617,
|
|
"kl": 0.2354841867927462,
|
|
"learning_rate": 5.714285714285714e-05,
|
|
"loss": -0.0904,
|
|
"num_tokens": 3211621.0,
|
|
"reward": 0.09562171995639801,
|
|
"reward_std": 0.6017146706581116,
|
|
"rewards/rollout_eval_reward_func/mean": 0.1835619956254959,
|
|
"rewards/rollout_eval_reward_func/std": 0.2800058424472809,
|
|
"rewards/rollout_reward_func/mean": 0.09562171995639801,
|
|
"rewards/rollout_reward_func/std": 0.5979344248771667,
|
|
"sampling/importance_sampling_ratio/max": 1.7227435111999512,
|
|
"sampling/importance_sampling_ratio/mean": 0.9981924295425415,
|
|
"sampling/importance_sampling_ratio/min": 0.38243889808654785,
|
|
"sampling/sampling_logp_difference/max": 0.961186408996582,
|
|
"sampling/sampling_logp_difference/mean": 0.04361895099282265,
|
|
"step": 21,
|
|
"step_time": 94.40408171299714
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.03222161578014493,
|
|
"clip_ratio/high_mean": 0.0181941413320601,
|
|
"clip_ratio/low_mean": 0.02708333428017795,
|
|
"clip_ratio/low_min": 0.0062500000931322575,
|
|
"clip_ratio/region_mean": 0.04527747584506869,
|
|
"entropy": 0.3229655371978879,
|
|
"epoch": 0.00022,
|
|
"grad_norm": 0.8647798895835876,
|
|
"kl": 0.21354854525998235,
|
|
"learning_rate": 6e-05,
|
|
"loss": -0.1008,
|
|
"step": 22,
|
|
"step_time": 30.11174104199381
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0011160714784637094,
|
|
"clip_ratio/high_mean": 0.0005580357392318547,
|
|
"clip_ratio/low_mean": 0.0010995370685122907,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0016575728077441454,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10470.0,
|
|
"completions/max_terminated_length": 10470.0,
|
|
"completions/mean_length": 7568.375,
|
|
"completions/mean_terminated_length": 7568.375,
|
|
"completions/min_length": 2202.0,
|
|
"completions/min_terminated_length": 2202.0,
|
|
"entropy": 0.28525836300104856,
|
|
"epoch": 0.00023,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0814907550811768,
|
|
"kl": 0.35280791157856584,
|
|
"learning_rate": 6.285714285714286e-05,
|
|
"loss": 0.016,
|
|
"num_tokens": 3482436.0,
|
|
"reward": 0.2800288200378418,
|
|
"reward_std": 0.7106037139892578,
|
|
"rewards/rollout_eval_reward_func/mean": 0.33079269528388977,
|
|
"rewards/rollout_eval_reward_func/std": 0.3271085023880005,
|
|
"rewards/rollout_reward_func/mean": 0.2800288200378418,
|
|
"rewards/rollout_reward_func/std": 0.6996307373046875,
|
|
"sampling/importance_sampling_ratio/max": 1.6482936143875122,
|
|
"sampling/importance_sampling_ratio/mean": 1.0002542734146118,
|
|
"sampling/importance_sampling_ratio/min": 0.2758394777774811,
|
|
"sampling/sampling_logp_difference/max": 1.2879362106323242,
|
|
"sampling/sampling_logp_difference/mean": 0.0332026481628418,
|
|
"step": 23,
|
|
"step_time": 93.99063302500326
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.01396139187272638,
|
|
"clip_ratio/high_mean": 0.007690923230256885,
|
|
"clip_ratio/low_mean": 0.01880787085974589,
|
|
"clip_ratio/low_min": 0.0031250001629814506,
|
|
"clip_ratio/region_mean": 0.02649879432283342,
|
|
"entropy": 0.2676102966070175,
|
|
"epoch": 0.00024,
|
|
"grad_norm": 0.8727543354034424,
|
|
"kl": 0.3772396189160645,
|
|
"learning_rate": 6.571428571428571e-05,
|
|
"loss": 0.0057,
|
|
"step": 24,
|
|
"step_time": 29.4178187339985
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.004613095428794622,
|
|
"clip_ratio/high_mean": 0.002985895553138107,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0035067289136350155,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9569.0,
|
|
"completions/max_terminated_length": 9569.0,
|
|
"completions/mean_length": 7533.28125,
|
|
"completions/mean_terminated_length": 7533.28125,
|
|
"completions/min_length": 2449.0,
|
|
"completions/min_terminated_length": 2449.0,
|
|
"entropy": 0.2505391649901867,
|
|
"epoch": 0.00025,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.017386555671692,
|
|
"kl": 0.21523633878678083,
|
|
"learning_rate": 6.857142857142858e-05,
|
|
"loss": 0.0242,
|
|
"num_tokens": 3751699.0,
|
|
"reward": 0.3911706805229187,
|
|
"reward_std": 0.638326108455658,
|
|
"rewards/rollout_eval_reward_func/mean": 0.36318597197532654,
|
|
"rewards/rollout_eval_reward_func/std": 0.3184514343738556,
|
|
"rewards/rollout_reward_func/mean": 0.3911706805229187,
|
|
"rewards/rollout_reward_func/std": 0.6562069654464722,
|
|
"sampling/importance_sampling_ratio/max": 1.5404945611953735,
|
|
"sampling/importance_sampling_ratio/mean": 0.9984301328659058,
|
|
"sampling/importance_sampling_ratio/min": 0.4790920615196228,
|
|
"sampling/sampling_logp_difference/max": 0.7358624935150146,
|
|
"sampling/sampling_logp_difference/mean": 0.025531694293022156,
|
|
"step": 25,
|
|
"step_time": 92.37763964700025
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.02074831852223724,
|
|
"clip_ratio/high_mean": 0.014075796061661094,
|
|
"clip_ratio/low_mean": 0.024038826406467706,
|
|
"clip_ratio/low_min": 0.004687500186264515,
|
|
"clip_ratio/region_mean": 0.0381146224681288,
|
|
"entropy": 0.24146342556923628,
|
|
"epoch": 0.00026,
|
|
"grad_norm": 1.08539617061615,
|
|
"kl": 0.242179695982486,
|
|
"learning_rate": 7.142857142857143e-05,
|
|
"loss": 0.0152,
|
|
"step": 26,
|
|
"step_time": 27.09601488199405
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.004924242617562413,
|
|
"clip_ratio/high_mean": 0.0024621213087812066,
|
|
"clip_ratio/low_mean": 0.0015625000814907253,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.004024621390271932,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9714.0,
|
|
"completions/max_terminated_length": 9714.0,
|
|
"completions/mean_length": 7341.125,
|
|
"completions/mean_terminated_length": 7341.125,
|
|
"completions/min_length": 834.0,
|
|
"completions/min_terminated_length": 834.0,
|
|
"entropy": 0.24662253353744745,
|
|
"epoch": 0.00027,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0926475524902344,
|
|
"kl": 0.201888975687325,
|
|
"learning_rate": 7.428571428571429e-05,
|
|
"loss": -0.0651,
|
|
"num_tokens": 4014835.0,
|
|
"reward": 0.26619410514831543,
|
|
"reward_std": 0.6366387009620667,
|
|
"rewards/rollout_eval_reward_func/mean": 0.31529471278190613,
|
|
"rewards/rollout_eval_reward_func/std": 0.3177616000175476,
|
|
"rewards/rollout_reward_func/mean": 0.26619410514831543,
|
|
"rewards/rollout_reward_func/std": 0.6645346879959106,
|
|
"sampling/importance_sampling_ratio/max": 1.7210402488708496,
|
|
"sampling/importance_sampling_ratio/mean": 0.9990845918655396,
|
|
"sampling/importance_sampling_ratio/min": 0.46208029985427856,
|
|
"sampling/sampling_logp_difference/max": 0.7720166444778442,
|
|
"sampling/sampling_logp_difference/mean": 0.024712545797228813,
|
|
"step": 27,
|
|
"step_time": 90.07543276499928
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.033208509092219174,
|
|
"clip_ratio/high_mean": 0.018557379313278943,
|
|
"clip_ratio/low_mean": 0.035281969350762665,
|
|
"clip_ratio/low_min": 0.011458333698101342,
|
|
"clip_ratio/region_mean": 0.05383934878045693,
|
|
"entropy": 0.24193121027201414,
|
|
"epoch": 0.00028,
|
|
"grad_norm": 0.9876235127449036,
|
|
"kl": 0.26401366433128715,
|
|
"learning_rate": 7.714285714285715e-05,
|
|
"loss": -0.073,
|
|
"step": 28,
|
|
"step_time": 27.219164144002207
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0010775862028822303,
|
|
"clip_ratio/high_mean": 0.0005387931014411151,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0005387931014411151,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9855.0,
|
|
"completions/max_terminated_length": 9855.0,
|
|
"completions/mean_length": 7361.875,
|
|
"completions/mean_terminated_length": 7361.875,
|
|
"completions/min_length": 842.0,
|
|
"completions/min_terminated_length": 842.0,
|
|
"entropy": 0.21860306337475777,
|
|
"epoch": 0.00029,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9292377233505249,
|
|
"kl": 0.20060417288914323,
|
|
"learning_rate": 8e-05,
|
|
"loss": -0.0977,
|
|
"num_tokens": 4278506.0,
|
|
"reward": 0.30670806765556335,
|
|
"reward_std": 0.652392566204071,
|
|
"rewards/rollout_eval_reward_func/mean": 0.3278709352016449,
|
|
"rewards/rollout_eval_reward_func/std": 0.31351709365844727,
|
|
"rewards/rollout_reward_func/mean": 0.30670806765556335,
|
|
"rewards/rollout_reward_func/std": 0.6815608143806458,
|
|
"sampling/importance_sampling_ratio/max": 1.4481010437011719,
|
|
"sampling/importance_sampling_ratio/mean": 1.0026426315307617,
|
|
"sampling/importance_sampling_ratio/min": 0.5693169832229614,
|
|
"sampling/sampling_logp_difference/max": 0.5633178949356079,
|
|
"sampling/sampling_logp_difference/mean": 0.01894025132060051,
|
|
"step": 29,
|
|
"step_time": 88.37378997200358
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.02580322092399001,
|
|
"clip_ratio/high_mean": 0.015042814193293452,
|
|
"clip_ratio/low_mean": 0.015608090267051011,
|
|
"clip_ratio/low_min": 0.0020833334419876337,
|
|
"clip_ratio/region_mean": 0.030650904460344464,
|
|
"entropy": 0.2232473948970437,
|
|
"epoch": 0.0003,
|
|
"grad_norm": 0.6086679697036743,
|
|
"kl": 0.19415233470499516,
|
|
"learning_rate": 8.285714285714287e-05,
|
|
"loss": -0.1081,
|
|
"step": 30,
|
|
"step_time": 28.619991764000588
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0005208333604969084,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10726.0,
|
|
"completions/max_terminated_length": 10726.0,
|
|
"completions/mean_length": 7164.65625,
|
|
"completions/mean_terminated_length": 7164.65625,
|
|
"completions/min_length": 470.0,
|
|
"completions/min_terminated_length": 470.0,
|
|
"entropy": 0.23761425912380219,
|
|
"epoch": 0.00031,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.050552487373352,
|
|
"kl": 0.25638002483174205,
|
|
"learning_rate": 8.571428571428571e-05,
|
|
"loss": 0.012,
|
|
"num_tokens": 4536097.0,
|
|
"reward": 0.3345087766647339,
|
|
"reward_std": 0.5485296249389648,
|
|
"rewards/rollout_eval_reward_func/mean": 0.3090701103210449,
|
|
"rewards/rollout_eval_reward_func/std": 0.32714226841926575,
|
|
"rewards/rollout_reward_func/mean": 0.3345087766647339,
|
|
"rewards/rollout_reward_func/std": 0.6012357473373413,
|
|
"sampling/importance_sampling_ratio/max": 1.438549518585205,
|
|
"sampling/importance_sampling_ratio/mean": 1.0011037588119507,
|
|
"sampling/importance_sampling_ratio/min": 0.6349728107452393,
|
|
"sampling/sampling_logp_difference/max": 0.45417308807373047,
|
|
"sampling/sampling_logp_difference/mean": 0.015337169170379639,
|
|
"step": 31,
|
|
"step_time": 92.49027231299806
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.03391559107694775,
|
|
"clip_ratio/high_mean": 0.018867517996113747,
|
|
"clip_ratio/low_mean": 0.044338769221212715,
|
|
"clip_ratio/low_min": 0.008333333535119891,
|
|
"clip_ratio/region_mean": 0.06320628756657243,
|
|
"entropy": 0.22916866652667522,
|
|
"epoch": 0.00032,
|
|
"grad_norm": 1.028586745262146,
|
|
"kl": 0.3105860697105527,
|
|
"learning_rate": 8.857142857142857e-05,
|
|
"loss": 0.0055,
|
|
"step": 32,
|
|
"step_time": 29.399824877003994
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0024519230937585235,
|
|
"clip_ratio/high_mean": 0.0012259615468792617,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0012259615468792617,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10128.0,
|
|
"completions/max_terminated_length": 10128.0,
|
|
"completions/mean_length": 7357.46875,
|
|
"completions/mean_terminated_length": 7357.46875,
|
|
"completions/min_length": 1917.0,
|
|
"completions/min_terminated_length": 1917.0,
|
|
"entropy": 0.2557551637291908,
|
|
"epoch": 0.00033,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9717881083488464,
|
|
"kl": 0.2046954189427197,
|
|
"learning_rate": 9.142857142857143e-05,
|
|
"loss": 0.0245,
|
|
"num_tokens": 4799621.0,
|
|
"reward": 0.35216301679611206,
|
|
"reward_std": 0.6164546608924866,
|
|
"rewards/rollout_eval_reward_func/mean": 0.3365091383457184,
|
|
"rewards/rollout_eval_reward_func/std": 0.3354848027229309,
|
|
"rewards/rollout_reward_func/mean": 0.35216301679611206,
|
|
"rewards/rollout_reward_func/std": 0.6309141516685486,
|
|
"sampling/importance_sampling_ratio/max": 1.333243727684021,
|
|
"sampling/importance_sampling_ratio/mean": 1.0005223751068115,
|
|
"sampling/importance_sampling_ratio/min": 0.7339702248573303,
|
|
"sampling/sampling_logp_difference/max": 0.30928683280944824,
|
|
"sampling/sampling_logp_difference/mean": 0.014704002998769283,
|
|
"step": 33,
|
|
"step_time": 89.53553034700417
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.01991061063017696,
|
|
"clip_ratio/high_mean": 0.011966300604399294,
|
|
"clip_ratio/low_mean": 0.02272569522028789,
|
|
"clip_ratio/low_min": 0.009722222457639873,
|
|
"clip_ratio/region_mean": 0.03469199570827186,
|
|
"entropy": 0.2428069869056344,
|
|
"epoch": 0.00034,
|
|
"grad_norm": 0.685612142086029,
|
|
"kl": 0.2513351505622268,
|
|
"learning_rate": 9.428571428571429e-05,
|
|
"loss": 0.0129,
|
|
"step": 34,
|
|
"step_time": 28.25809028400181
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0021990741370245814,
|
|
"clip_ratio/high_mean": 0.0010995370685122907,
|
|
"clip_ratio/low_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0021412037895061076,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9665.0,
|
|
"completions/max_terminated_length": 9665.0,
|
|
"completions/mean_length": 8000.09375,
|
|
"completions/mean_terminated_length": 8000.09375,
|
|
"completions/min_length": 4295.0,
|
|
"completions/min_terminated_length": 4295.0,
|
|
"entropy": 0.2354184165596962,
|
|
"epoch": 0.00035,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.040405035018921,
|
|
"kl": 0.1770010399632156,
|
|
"learning_rate": 9.714285714285715e-05,
|
|
"loss": 0.1517,
|
|
"num_tokens": 5084103.0,
|
|
"reward": 0.3385156989097595,
|
|
"reward_std": 0.5189785957336426,
|
|
"rewards/rollout_eval_reward_func/mean": 0.23996442556381226,
|
|
"rewards/rollout_eval_reward_func/std": 0.31991085410118103,
|
|
"rewards/rollout_reward_func/mean": 0.3385156989097595,
|
|
"rewards/rollout_reward_func/std": 0.5693588852882385,
|
|
"sampling/importance_sampling_ratio/max": 1.4071576595306396,
|
|
"sampling/importance_sampling_ratio/mean": 0.9996304512023926,
|
|
"sampling/importance_sampling_ratio/min": 0.5387703776359558,
|
|
"sampling/sampling_logp_difference/max": 0.6184659004211426,
|
|
"sampling/sampling_logp_difference/mean": 0.015029089525341988,
|
|
"step": 35,
|
|
"step_time": 95.05921310200392
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.026263557723723352,
|
|
"clip_ratio/high_mean": 0.014173445466440171,
|
|
"clip_ratio/low_mean": 0.02787990286014974,
|
|
"clip_ratio/low_min": 0.007291667046956718,
|
|
"clip_ratio/region_mean": 0.04205334832658991,
|
|
"entropy": 0.21858789399266243,
|
|
"epoch": 0.00036,
|
|
"grad_norm": 1.0455042123794556,
|
|
"kl": 0.2051441869698465,
|
|
"learning_rate": 0.0001,
|
|
"loss": 0.1403,
|
|
"step": 36,
|
|
"step_time": 27.85193802100366
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.007164939888752997,
|
|
"clip_ratio/high_mean": 0.0035824699443764985,
|
|
"clip_ratio/low_mean": 0.0015625000232830644,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.005144969967659563,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9514.0,
|
|
"completions/max_terminated_length": 9514.0,
|
|
"completions/mean_length": 6029.25,
|
|
"completions/mean_terminated_length": 6029.25,
|
|
"completions/min_length": 1061.0,
|
|
"completions/min_terminated_length": 1061.0,
|
|
"entropy": 0.21716525312513113,
|
|
"epoch": 0.00037,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8604521751403809,
|
|
"kl": 0.23097522975876927,
|
|
"learning_rate": 9.999736485702831e-05,
|
|
"loss": -0.0709,
|
|
"num_tokens": 5305345.0,
|
|
"reward": 0.41453179717063904,
|
|
"reward_std": 0.7797224521636963,
|
|
"rewards/rollout_eval_reward_func/mean": 0.4568089246749878,
|
|
"rewards/rollout_eval_reward_func/std": 0.28734299540519714,
|
|
"rewards/rollout_reward_func/mean": 0.41453179717063904,
|
|
"rewards/rollout_reward_func/std": 0.755694568157196,
|
|
"sampling/importance_sampling_ratio/max": 1.4738141298294067,
|
|
"sampling/importance_sampling_ratio/mean": 1.000828742980957,
|
|
"sampling/importance_sampling_ratio/min": 0.7324953079223633,
|
|
"sampling/sampling_logp_difference/max": 0.3878536820411682,
|
|
"sampling/sampling_logp_difference/mean": 0.013184964656829834,
|
|
"step": 37,
|
|
"step_time": 76.87407001600332
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.04774210066534579,
|
|
"clip_ratio/high_mean": 0.02752261853311211,
|
|
"clip_ratio/low_mean": 0.03158482233993709,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.059107440523803234,
|
|
"entropy": 0.21499714627861977,
|
|
"epoch": 0.00038,
|
|
"grad_norm": 1.026845932006836,
|
|
"kl": 0.3676267918199301,
|
|
"learning_rate": 9.998945979845876e-05,
|
|
"loss": -0.0694,
|
|
"step": 38,
|
|
"step_time": 27.58343887600313
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.006285919691435993,
|
|
"clip_ratio/high_mean": 0.0031429598457179964,
|
|
"clip_ratio/low_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.004184626566711813,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9353.0,
|
|
"completions/max_terminated_length": 9353.0,
|
|
"completions/mean_length": 6221.78125,
|
|
"completions/mean_terminated_length": 6221.78125,
|
|
"completions/min_length": 1175.0,
|
|
"completions/min_terminated_length": 1175.0,
|
|
"entropy": 0.21314978785812855,
|
|
"epoch": 0.00039,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1063776016235352,
|
|
"kl": 0.28443425707519054,
|
|
"learning_rate": 9.997628593527586e-05,
|
|
"loss": 0.1657,
|
|
"num_tokens": 5533203.0,
|
|
"reward": 0.5931290984153748,
|
|
"reward_std": 0.5068180561065674,
|
|
"rewards/rollout_eval_reward_func/mean": 0.4369918704032898,
|
|
"rewards/rollout_eval_reward_func/std": 0.2919425666332245,
|
|
"rewards/rollout_reward_func/mean": 0.5931290984153748,
|
|
"rewards/rollout_reward_func/std": 0.6152276396751404,
|
|
"sampling/importance_sampling_ratio/max": 1.4768017530441284,
|
|
"sampling/importance_sampling_ratio/mean": 0.9989122152328491,
|
|
"sampling/importance_sampling_ratio/min": 0.7442160248756409,
|
|
"sampling/sampling_logp_difference/max": 0.3898787498474121,
|
|
"sampling/sampling_logp_difference/mean": 0.011076296679675579,
|
|
"step": 39,
|
|
"step_time": 80.26773473300273
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.03581550612580031,
|
|
"clip_ratio/high_mean": 0.021467003040015697,
|
|
"clip_ratio/low_mean": 0.019476010755170137,
|
|
"clip_ratio/low_min": 0.0031250000465661287,
|
|
"clip_ratio/region_mean": 0.04094301396980882,
|
|
"entropy": 0.2001811731606722,
|
|
"epoch": 0.0004,
|
|
"grad_norm": 0.8571550250053406,
|
|
"kl": 0.39517259504646063,
|
|
"learning_rate": 9.995784511894694e-05,
|
|
"loss": 0.1561,
|
|
"step": 40,
|
|
"step_time": 26.113719172002675
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0027173913549631834,
|
|
"clip_ratio/high_mean": 0.0013586956774815917,
|
|
"clip_ratio/low_mean": 0.003238224715460092,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0045969203929416835,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9876.0,
|
|
"completions/max_terminated_length": 9876.0,
|
|
"completions/mean_length": 7216.5625,
|
|
"completions/mean_terminated_length": 7216.5625,
|
|
"completions/min_length": 1879.0,
|
|
"completions/min_terminated_length": 1879.0,
|
|
"entropy": 0.2681358586996794,
|
|
"epoch": 0.00041,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.3309671878814697,
|
|
"kl": 0.2410402470268309,
|
|
"learning_rate": 9.993413994116206e-05,
|
|
"loss": 0.1903,
|
|
"num_tokens": 5792478.0,
|
|
"reward": 0.471214234828949,
|
|
"reward_std": 0.5625734329223633,
|
|
"rewards/rollout_eval_reward_func/mean": 0.3643292784690857,
|
|
"rewards/rollout_eval_reward_func/std": 0.34053289890289307,
|
|
"rewards/rollout_reward_func/mean": 0.471214234828949,
|
|
"rewards/rollout_reward_func/std": 0.6072424650192261,
|
|
"sampling/importance_sampling_ratio/max": 1.8356192111968994,
|
|
"sampling/importance_sampling_ratio/mean": 1.0007987022399902,
|
|
"sampling/importance_sampling_ratio/min": 0.4829617738723755,
|
|
"sampling/sampling_logp_difference/max": 0.7278177738189697,
|
|
"sampling/sampling_logp_difference/mean": 0.014709306880831718,
|
|
"step": 41,
|
|
"step_time": 87.47009326799707
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.034506134572438896,
|
|
"clip_ratio/high_mean": 0.01836913888109848,
|
|
"clip_ratio/low_mean": 0.03956068912521005,
|
|
"clip_ratio/low_min": 0.012500000651925802,
|
|
"clip_ratio/region_mean": 0.05792982783168554,
|
|
"entropy": 0.27205855678766966,
|
|
"epoch": 0.00042,
|
|
"grad_norm": 1.0188957452774048,
|
|
"kl": 0.30527770798653364,
|
|
"learning_rate": 9.990517373346957e-05,
|
|
"loss": 0.1841,
|
|
"step": 42,
|
|
"step_time": 27.952364619004584
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.005300949211232364,
|
|
"clip_ratio/high_mean": 0.002650474605616182,
|
|
"clip_ratio/low_mean": 0.0015625000814907253,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.004212974687106907,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10094.0,
|
|
"completions/max_terminated_length": 10094.0,
|
|
"completions/mean_length": 6369.84375,
|
|
"completions/mean_terminated_length": 6369.84375,
|
|
"completions/min_length": 701.0,
|
|
"completions/min_terminated_length": 701.0,
|
|
"entropy": 0.24548510648310184,
|
|
"epoch": 0.00043,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8371849656105042,
|
|
"kl": 0.22607734380289912,
|
|
"learning_rate": 9.98709505668081e-05,
|
|
"loss": -0.1383,
|
|
"num_tokens": 6024570.0,
|
|
"reward": 0.5083565711975098,
|
|
"reward_std": 0.7129669785499573,
|
|
"rewards/rollout_eval_reward_func/mean": 0.4181910753250122,
|
|
"rewards/rollout_eval_reward_func/std": 0.3106958866119385,
|
|
"rewards/rollout_reward_func/mean": 0.5083565711975098,
|
|
"rewards/rollout_reward_func/std": 0.679851770401001,
|
|
"sampling/importance_sampling_ratio/max": 1.6035348176956177,
|
|
"sampling/importance_sampling_ratio/mean": 1.0009479522705078,
|
|
"sampling/importance_sampling_ratio/min": 0.7113155722618103,
|
|
"sampling/sampling_logp_difference/max": 0.4722104072570801,
|
|
"sampling/sampling_logp_difference/mean": 0.010827964171767235,
|
|
"step": 43,
|
|
"step_time": 81.8608712560017
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.022805775748565793,
|
|
"clip_ratio/high_mean": 0.01218413794413209,
|
|
"clip_ratio/low_mean": 0.026488096278626472,
|
|
"clip_ratio/low_min": 0.0020833334419876337,
|
|
"clip_ratio/region_mean": 0.03867223463021219,
|
|
"entropy": 0.2484031356871128,
|
|
"epoch": 0.00044,
|
|
"grad_norm": 0.6352972388267517,
|
|
"kl": 0.24903920874930918,
|
|
"learning_rate": 9.983147525093428e-05,
|
|
"loss": -0.1456,
|
|
"step": 44,
|
|
"step_time": 28.312056484001005
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0020833334419876337,
|
|
"clip_ratio/high_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0020833334419876337,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10030.0,
|
|
"completions/max_terminated_length": 10030.0,
|
|
"completions/mean_length": 7470.40625,
|
|
"completions/mean_terminated_length": 7470.40625,
|
|
"completions/min_length": 3212.0,
|
|
"completions/min_terminated_length": 3212.0,
|
|
"entropy": 0.26859680097550154,
|
|
"epoch": 0.00045,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9950742125511169,
|
|
"kl": 0.2715269709005952,
|
|
"learning_rate": 9.978675333374685e-05,
|
|
"loss": 0.1354,
|
|
"num_tokens": 6292193.0,
|
|
"reward": 0.31536591053009033,
|
|
"reward_std": 0.626213550567627,
|
|
"rewards/rollout_eval_reward_func/mean": 0.2950965166091919,
|
|
"rewards/rollout_eval_reward_func/std": 0.3288768529891968,
|
|
"rewards/rollout_reward_func/mean": 0.31536591053009033,
|
|
"rewards/rollout_reward_func/std": 0.6272794604301453,
|
|
"sampling/importance_sampling_ratio/max": 1.2761257886886597,
|
|
"sampling/importance_sampling_ratio/mean": 0.9995177388191223,
|
|
"sampling/importance_sampling_ratio/min": 0.6398259401321411,
|
|
"sampling/sampling_logp_difference/max": 0.44655919075012207,
|
|
"sampling/sampling_logp_difference/mean": 0.01289924792945385,
|
|
"step": 45,
|
|
"step_time": 89.98842330299703
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.029475471819750965,
|
|
"clip_ratio/high_mean": 0.017039196158293635,
|
|
"clip_ratio/low_mean": 0.035884891636669636,
|
|
"clip_ratio/low_min": 0.014583333861082792,
|
|
"clip_ratio/region_mean": 0.05292408773675561,
|
|
"entropy": 0.25596251618117094,
|
|
"epoch": 0.00046,
|
|
"grad_norm": 1.0492225885391235,
|
|
"kl": 0.4555607410147786,
|
|
"learning_rate": 9.973679110050689e-05,
|
|
"loss": 0.1236,
|
|
"step": 46,
|
|
"step_time": 28.10059149600238
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.005558473523706198,
|
|
"clip_ratio/high_mean": 0.002779236761853099,
|
|
"clip_ratio/low_mean": 0.0031250001629814506,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.005904236924834549,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10171.0,
|
|
"completions/max_terminated_length": 10171.0,
|
|
"completions/mean_length": 7720.34375,
|
|
"completions/mean_terminated_length": 7720.34375,
|
|
"completions/min_length": 2255.0,
|
|
"completions/min_terminated_length": 2255.0,
|
|
"entropy": 0.21848125476390123,
|
|
"epoch": 0.00047,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9580699801445007,
|
|
"kl": 0.2126072864048183,
|
|
"learning_rate": 9.968159557295458e-05,
|
|
"loss": 0.2391,
|
|
"num_tokens": 6567972.0,
|
|
"reward": 0.585047721862793,
|
|
"reward_std": 0.4849390387535095,
|
|
"rewards/rollout_eval_reward_func/mean": 0.35200709104537964,
|
|
"rewards/rollout_eval_reward_func/std": 0.33855971693992615,
|
|
"rewards/rollout_reward_func/mean": 0.585047721862793,
|
|
"rewards/rollout_reward_func/std": 0.4694308936595917,
|
|
"sampling/importance_sampling_ratio/max": 1.3900582790374756,
|
|
"sampling/importance_sampling_ratio/mean": 1.0005149841308594,
|
|
"sampling/importance_sampling_ratio/min": 0.5463369488716125,
|
|
"sampling/sampling_logp_difference/max": 0.6045193672180176,
|
|
"sampling/sampling_logp_difference/mean": 0.012745920568704605,
|
|
"step": 47,
|
|
"step_time": 91.15270540599704
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.03133936191443354,
|
|
"clip_ratio/high_mean": 0.017232181096915156,
|
|
"clip_ratio/low_mean": 0.04218750132713467,
|
|
"clip_ratio/low_min": 0.01145833358168602,
|
|
"clip_ratio/region_mean": 0.059419682365842164,
|
|
"entropy": 0.23045554850250483,
|
|
"epoch": 0.00048,
|
|
"grad_norm": 1.2474925518035889,
|
|
"kl": 0.18294932693243027,
|
|
"learning_rate": 9.962117450832225e-05,
|
|
"loss": 0.238,
|
|
"step": 48,
|
|
"step_time": 29.046616760999314
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.006842764443717897,
|
|
"clip_ratio/high_mean": 0.0034213822218589485,
|
|
"clip_ratio/low_mean": 0.0015625000232830644,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.004983882245142013,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10017.0,
|
|
"completions/max_terminated_length": 10017.0,
|
|
"completions/mean_length": 7918.40625,
|
|
"completions/mean_terminated_length": 7918.40625,
|
|
"completions/min_length": 1876.0,
|
|
"completions/min_terminated_length": 1876.0,
|
|
"entropy": 0.24847039952874184,
|
|
"epoch": 0.00049,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1504110097885132,
|
|
"kl": 0.3213502997532487,
|
|
"learning_rate": 9.955553639824423e-05,
|
|
"loss": 0.1906,
|
|
"num_tokens": 6849638.0,
|
|
"reward": 0.39189645648002625,
|
|
"reward_std": 0.5209037065505981,
|
|
"rewards/rollout_eval_reward_func/mean": 0.2815040647983551,
|
|
"rewards/rollout_eval_reward_func/std": 0.332853227853775,
|
|
"rewards/rollout_reward_func/mean": 0.39189645648002625,
|
|
"rewards/rollout_reward_func/std": 0.5881980061531067,
|
|
"sampling/importance_sampling_ratio/max": 1.4030216932296753,
|
|
"sampling/importance_sampling_ratio/mean": 0.9992052316665649,
|
|
"sampling/importance_sampling_ratio/min": 0.6490213871002197,
|
|
"sampling/sampling_logp_difference/max": 0.43228960037231445,
|
|
"sampling/sampling_logp_difference/mean": 0.011766092851758003,
|
|
"step": 49,
|
|
"step_time": 91.98302743600289
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.030021664802916348,
|
|
"clip_ratio/high_mean": 0.01896916568512097,
|
|
"clip_ratio/low_mean": 0.02840909146470949,
|
|
"clip_ratio/low_min": 0.0031250001629814506,
|
|
"clip_ratio/region_mean": 0.04737825732445344,
|
|
"entropy": 0.22083801217377186,
|
|
"epoch": 0.0005,
|
|
"grad_norm": 1.493245005607605,
|
|
"kl": 0.6161252139136195,
|
|
"learning_rate": 9.948469046756344e-05,
|
|
"loss": 0.1882,
|
|
"step": 50,
|
|
"step_time": 29.706524382998396
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.007615459966473281,
|
|
"clip_ratio/high_mean": 0.0038077299832366407,
|
|
"clip_ratio/low_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0048493967042304575,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10057.0,
|
|
"completions/max_terminated_length": 10057.0,
|
|
"completions/mean_length": 7061.03125,
|
|
"completions/mean_terminated_length": 7061.03125,
|
|
"completions/min_length": 2525.0,
|
|
"completions/min_terminated_length": 2525.0,
|
|
"entropy": 0.24380221962928772,
|
|
"epoch": 0.00051,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1951247453689575,
|
|
"kl": 0.28687382210046053,
|
|
"learning_rate": 9.940864667303489e-05,
|
|
"loss": 0.1425,
|
|
"num_tokens": 7103728.0,
|
|
"reward": 0.406146377325058,
|
|
"reward_std": 0.6755715608596802,
|
|
"rewards/rollout_eval_reward_func/mean": 0.3859247863292694,
|
|
"rewards/rollout_eval_reward_func/std": 0.33643871545791626,
|
|
"rewards/rollout_reward_func/mean": 0.406146377325058,
|
|
"rewards/rollout_reward_func/std": 0.6774359345436096,
|
|
"sampling/importance_sampling_ratio/max": 1.367674469947815,
|
|
"sampling/importance_sampling_ratio/mean": 0.9991032481193542,
|
|
"sampling/importance_sampling_ratio/min": 0.6542518734931946,
|
|
"sampling/sampling_logp_difference/max": 0.4242628812789917,
|
|
"sampling/sampling_logp_difference/mean": 0.012621527537703514,
|
|
"step": 51,
|
|
"step_time": 85.52853098199739
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.023708798456937075,
|
|
"clip_ratio/high_mean": 0.015155438333749771,
|
|
"clip_ratio/low_mean": 0.02644535672152415,
|
|
"clip_ratio/low_min": 0.009695513173937798,
|
|
"clip_ratio/region_mean": 0.04160079546272755,
|
|
"entropy": 0.24589570611715317,
|
|
"epoch": 0.00052,
|
|
"grad_norm": 0.6901561617851257,
|
|
"kl": 0.2809536149725318,
|
|
"learning_rate": 9.932741570192633e-05,
|
|
"loss": 0.1278,
|
|
"step": 52,
|
|
"step_time": 28.923457664002854
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0011160714784637094,
|
|
"clip_ratio/high_mean": 0.0005580357392318547,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.001078869099728763,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10169.0,
|
|
"completions/max_terminated_length": 10169.0,
|
|
"completions/mean_length": 7814.28125,
|
|
"completions/mean_terminated_length": 7814.28125,
|
|
"completions/min_length": 1989.0,
|
|
"completions/min_terminated_length": 1989.0,
|
|
"entropy": 0.21275948453694582,
|
|
"epoch": 0.00053,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7513790130615234,
|
|
"kl": 0.23512054793536663,
|
|
"learning_rate": 9.924100897051629e-05,
|
|
"loss": 0.1945,
|
|
"num_tokens": 7382261.0,
|
|
"reward": 0.42753756046295166,
|
|
"reward_std": 0.49785035848617554,
|
|
"rewards/rollout_eval_reward_func/mean": 0.26969003677368164,
|
|
"rewards/rollout_eval_reward_func/std": 0.3341839611530304,
|
|
"rewards/rollout_reward_func/mean": 0.42753756046295166,
|
|
"rewards/rollout_reward_func/std": 0.49307680130004883,
|
|
"sampling/importance_sampling_ratio/max": 1.3325144052505493,
|
|
"sampling/importance_sampling_ratio/mean": 0.9995752573013306,
|
|
"sampling/importance_sampling_ratio/min": 0.6147154569625854,
|
|
"sampling/sampling_logp_difference/max": 0.48659586906433105,
|
|
"sampling/sampling_logp_difference/mean": 0.010477245785295963,
|
|
"step": 53,
|
|
"step_time": 89.77598898800352
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.014756215270608664,
|
|
"clip_ratio/high_mean": 0.007378107635304332,
|
|
"clip_ratio/low_mean": 0.026041667733807117,
|
|
"clip_ratio/low_min": 0.008333333651535213,
|
|
"clip_ratio/region_mean": 0.03341977560194209,
|
|
"entropy": 0.20128578413277864,
|
|
"epoch": 0.00054,
|
|
"grad_norm": 0.570249080657959,
|
|
"kl": 0.24723996873944998,
|
|
"learning_rate": 9.914943862248966e-05,
|
|
"loss": 0.1836,
|
|
"step": 54,
|
|
"step_time": 28.66781206799169
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.005409664008766413,
|
|
"clip_ratio/high_mean": 0.0027048320043832064,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.003225665364880115,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9612.0,
|
|
"completions/max_terminated_length": 9612.0,
|
|
"completions/mean_length": 7466.40625,
|
|
"completions/mean_terminated_length": 7466.40625,
|
|
"completions/min_length": 897.0,
|
|
"completions/min_terminated_length": 897.0,
|
|
"entropy": 0.2242852784693241,
|
|
"epoch": 0.00055,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7776551246643066,
|
|
"kl": 0.22325131320394576,
|
|
"learning_rate": 9.905271752723088e-05,
|
|
"loss": 0.0206,
|
|
"num_tokens": 7648812.0,
|
|
"reward": 0.40199464559555054,
|
|
"reward_std": 0.5598407983779907,
|
|
"rewards/rollout_eval_reward_func/mean": 0.3375253975391388,
|
|
"rewards/rollout_eval_reward_func/std": 0.351279616355896,
|
|
"rewards/rollout_reward_func/mean": 0.40199464559555054,
|
|
"rewards/rollout_reward_func/std": 0.5975609421730042,
|
|
"sampling/importance_sampling_ratio/max": 1.317135214805603,
|
|
"sampling/importance_sampling_ratio/mean": 0.9976714849472046,
|
|
"sampling/importance_sampling_ratio/min": 0.6417545676231384,
|
|
"sampling/sampling_logp_difference/max": 0.4435492753982544,
|
|
"sampling/sampling_logp_difference/mean": 0.012365585193037987,
|
|
"step": 55,
|
|
"step_time": 90.48158546899867
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.02967093954794109,
|
|
"clip_ratio/high_mean": 0.01639796979725361,
|
|
"clip_ratio/low_mean": 0.017361111822538078,
|
|
"clip_ratio/low_min": 0.0031250001629814506,
|
|
"clip_ratio/region_mean": 0.03375908185262233,
|
|
"entropy": 0.2281673550605774,
|
|
"epoch": 0.00056,
|
|
"grad_norm": 0.48372626304626465,
|
|
"kl": 0.23605143558233976,
|
|
"learning_rate": 9.895085927801542e-05,
|
|
"loss": 0.0086,
|
|
"step": 56,
|
|
"step_time": 27.291444884000157
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.003557769814506173,
|
|
"clip_ratio/high_mean": 0.0017788849072530866,
|
|
"clip_ratio/low_mean": 0.0015625000814907253,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.003341384930536151,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10322.0,
|
|
"completions/max_terminated_length": 10322.0,
|
|
"completions/mean_length": 6645.3125,
|
|
"completions/mean_terminated_length": 6645.3125,
|
|
"completions/min_length": 1995.0,
|
|
"completions/min_terminated_length": 1995.0,
|
|
"entropy": 0.22339679207652807,
|
|
"epoch": 0.00057,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7947802543640137,
|
|
"kl": 0.33472199738025665,
|
|
"learning_rate": 9.884387819009922e-05,
|
|
"loss": 0.0286,
|
|
"num_tokens": 7889241.0,
|
|
"reward": 0.3272937536239624,
|
|
"reward_std": 0.7356898784637451,
|
|
"rewards/rollout_eval_reward_func/mean": 0.38528963923454285,
|
|
"rewards/rollout_eval_reward_func/std": 0.3158987760543823,
|
|
"rewards/rollout_reward_func/mean": 0.3272937536239624,
|
|
"rewards/rollout_reward_func/std": 0.7287615537643433,
|
|
"sampling/importance_sampling_ratio/max": 1.519856333732605,
|
|
"sampling/importance_sampling_ratio/mean": 1.0008394718170166,
|
|
"sampling/importance_sampling_ratio/min": 0.6888355612754822,
|
|
"sampling/sampling_logp_difference/max": 0.41861581802368164,
|
|
"sampling/sampling_logp_difference/mean": 0.01188460923731327,
|
|
"step": 57,
|
|
"step_time": 83.6079965079989
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.02337649872060865,
|
|
"clip_ratio/high_mean": 0.012729916197713464,
|
|
"clip_ratio/low_mean": 0.03550771565642208,
|
|
"clip_ratio/low_min": 0.013886852888390422,
|
|
"clip_ratio/region_mean": 0.048237632028758526,
|
|
"entropy": 0.23247116059064865,
|
|
"epoch": 0.00058,
|
|
"grad_norm": 0.6895915269851685,
|
|
"kl": 0.30278117302805185,
|
|
"learning_rate": 9.873178929870695e-05,
|
|
"loss": 0.0178,
|
|
"step": 58,
|
|
"step_time": 29.01562165299947
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.006458333344198763,
|
|
"clip_ratio/high_mean": 0.00375000003259629,
|
|
"clip_ratio/low_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.004791666753590107,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10121.0,
|
|
"completions/max_terminated_length": 10121.0,
|
|
"completions/mean_length": 7354.9375,
|
|
"completions/mean_terminated_length": 7354.9375,
|
|
"completions/min_length": 1114.0,
|
|
"completions/min_terminated_length": 1114.0,
|
|
"entropy": 0.2855970785021782,
|
|
"epoch": 0.00059,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1581138372421265,
|
|
"kl": 0.30998079385608435,
|
|
"learning_rate": 9.86146083569188e-05,
|
|
"loss": -0.077,
|
|
"num_tokens": 8152533.0,
|
|
"reward": 0.12930195033550262,
|
|
"reward_std": 0.6660091876983643,
|
|
"rewards/rollout_eval_reward_func/mean": 0.33841466903686523,
|
|
"rewards/rollout_eval_reward_func/std": 0.3268774151802063,
|
|
"rewards/rollout_reward_func/mean": 0.12930195033550262,
|
|
"rewards/rollout_reward_func/std": 0.7711123824119568,
|
|
"sampling/importance_sampling_ratio/max": 1.4381568431854248,
|
|
"sampling/importance_sampling_ratio/mean": 0.9980136156082153,
|
|
"sampling/importance_sampling_ratio/min": 0.7020198106765747,
|
|
"sampling/sampling_logp_difference/max": 0.36336231231689453,
|
|
"sampling/sampling_logp_difference/mean": 0.016959059983491898,
|
|
"step": 59,
|
|
"step_time": 87.87077508199764
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.048061754438094795,
|
|
"clip_ratio/high_mean": 0.031483913655392826,
|
|
"clip_ratio/low_mean": 0.04418836906552315,
|
|
"clip_ratio/low_min": 0.007291666814126074,
|
|
"clip_ratio/region_mean": 0.07567228260450065,
|
|
"entropy": 0.26963882334530354,
|
|
"epoch": 0.0006,
|
|
"grad_norm": 1.0022964477539062,
|
|
"kl": 0.30027929320931435,
|
|
"learning_rate": 9.84923518334567e-05,
|
|
"loss": -0.0828,
|
|
"step": 60,
|
|
"step_time": 28.71259851099967
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.01005121401976794,
|
|
"clip_ratio/high_mean": 0.005546440428588539,
|
|
"clip_ratio/low_mean": 0.0020026409183628857,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.007549081346951425,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10379.0,
|
|
"completions/max_terminated_length": 10379.0,
|
|
"completions/mean_length": 6955.875,
|
|
"completions/mean_terminated_length": 6955.875,
|
|
"completions/min_length": 2081.0,
|
|
"completions/min_terminated_length": 2081.0,
|
|
"entropy": 0.23451983137056231,
|
|
"epoch": 0.00061,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1702102422714233,
|
|
"kl": 0.28609952982515097,
|
|
"learning_rate": 9.83650369103696e-05,
|
|
"loss": 0.0631,
|
|
"num_tokens": 8403186.0,
|
|
"reward": 0.3940112888813019,
|
|
"reward_std": 0.67255699634552,
|
|
"rewards/rollout_eval_reward_func/mean": 0.3715701103210449,
|
|
"rewards/rollout_eval_reward_func/std": 0.3261474072933197,
|
|
"rewards/rollout_reward_func/mean": 0.3940112888813019,
|
|
"rewards/rollout_reward_func/std": 0.6762000322341919,
|
|
"sampling/importance_sampling_ratio/max": 1.3093942403793335,
|
|
"sampling/importance_sampling_ratio/mean": 1.0008020401000977,
|
|
"sampling/importance_sampling_ratio/min": 0.5961512923240662,
|
|
"sampling/sampling_logp_difference/max": 0.5172607898712158,
|
|
"sampling/sampling_logp_difference/mean": 0.014042183756828308,
|
|
"step": 61,
|
|
"step_time": 86.51144317899707
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.051156656933017075,
|
|
"clip_ratio/high_mean": 0.03779221937293187,
|
|
"clip_ratio/low_mean": 0.05211732583120465,
|
|
"clip_ratio/low_min": 0.024354460649192333,
|
|
"clip_ratio/region_mean": 0.08990954549517483,
|
|
"entropy": 0.21548824943602085,
|
|
"epoch": 0.00062,
|
|
"grad_norm": 1.1680642366409302,
|
|
"kl": 0.5453370595350862,
|
|
"learning_rate": 9.823268148061883e-05,
|
|
"loss": 0.0666,
|
|
"step": 62,
|
|
"step_time": 28.28093677799916
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.009642903693020344,
|
|
"clip_ratio/high_mean": 0.004821451846510172,
|
|
"clip_ratio/low_mean": 0.0010416667209938169,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.005863118567503989,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9611.0,
|
|
"completions/max_terminated_length": 9611.0,
|
|
"completions/mean_length": 5474.5,
|
|
"completions/mean_terminated_length": 5474.5,
|
|
"completions/min_length": 1264.0,
|
|
"completions/min_terminated_length": 1264.0,
|
|
"entropy": 0.20994199626147747,
|
|
"epoch": 0.00063,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.5162609815597534,
|
|
"kl": 0.3376044826582074,
|
|
"learning_rate": 9.809530414556335e-05,
|
|
"loss": 0.1386,
|
|
"num_tokens": 8606212.0,
|
|
"reward": 0.6158473491668701,
|
|
"reward_std": 0.705132782459259,
|
|
"rewards/rollout_eval_reward_func/mean": 0.5119410753250122,
|
|
"rewards/rollout_eval_reward_func/std": 0.2654803693294525,
|
|
"rewards/rollout_reward_func/mean": 0.6158473491668701,
|
|
"rewards/rollout_reward_func/std": 0.6767383813858032,
|
|
"sampling/importance_sampling_ratio/max": 1.9123412370681763,
|
|
"sampling/importance_sampling_ratio/mean": 0.9994137287139893,
|
|
"sampling/importance_sampling_ratio/min": 0.6006231904029846,
|
|
"sampling/sampling_logp_difference/max": 0.6483283042907715,
|
|
"sampling/sampling_logp_difference/mean": 0.015111252665519714,
|
|
"step": 63,
|
|
"step_time": 74.64896667399807
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.05132549628615379,
|
|
"clip_ratio/high_mean": 0.030718339723534882,
|
|
"clip_ratio/low_mean": 0.028882576967589557,
|
|
"clip_ratio/low_min": 0.0031250000465661287,
|
|
"clip_ratio/region_mean": 0.05960091657470912,
|
|
"entropy": 0.20065013086423278,
|
|
"epoch": 0.00064,
|
|
"grad_norm": 1.244667649269104,
|
|
"kl": 0.453593029640615,
|
|
"learning_rate": 9.79529242123455e-05,
|
|
"loss": 0.1234,
|
|
"step": 64,
|
|
"step_time": 24.8986287849948
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0077537596225738525,
|
|
"clip_ratio/high_mean": 0.0038768798112869263,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.004397713171783835,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10299.0,
|
|
"completions/max_terminated_length": 10299.0,
|
|
"completions/mean_length": 6500.03125,
|
|
"completions/mean_terminated_length": 6500.03125,
|
|
"completions/min_length": 1712.0,
|
|
"completions/min_terminated_length": 1712.0,
|
|
"entropy": 0.14482268318533897,
|
|
"epoch": 0.00065,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9947667121887207,
|
|
"kl": 0.2627185983583331,
|
|
"learning_rate": 9.780556169117757e-05,
|
|
"loss": 0.0665,
|
|
"num_tokens": 8841902.0,
|
|
"reward": 0.6678237915039062,
|
|
"reward_std": 0.5866862535476685,
|
|
"rewards/rollout_eval_reward_func/mean": 0.5013973712921143,
|
|
"rewards/rollout_eval_reward_func/std": 0.27832266688346863,
|
|
"rewards/rollout_reward_func/mean": 0.6678237915039062,
|
|
"rewards/rollout_reward_func/std": 0.5921808481216431,
|
|
"sampling/importance_sampling_ratio/max": 1.4597694873809814,
|
|
"sampling/importance_sampling_ratio/mean": 0.99915611743927,
|
|
"sampling/importance_sampling_ratio/min": 0.27695003151893616,
|
|
"sampling/sampling_logp_difference/max": 1.2839181423187256,
|
|
"sampling/sampling_logp_difference/mean": 0.010844534263014793,
|
|
"step": 65,
|
|
"step_time": 80.86000475000401
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.025044884881936014,
|
|
"clip_ratio/high_mean": 0.014345359115395695,
|
|
"clip_ratio/low_mean": 0.02013494382845238,
|
|
"clip_ratio/low_min": 0.0020833334419876337,
|
|
"clip_ratio/region_mean": 0.03448030271101743,
|
|
"entropy": 0.13131517032161355,
|
|
"epoch": 0.00066,
|
|
"grad_norm": 0.4750834107398987,
|
|
"kl": 0.34219094878062606,
|
|
"learning_rate": 9.765323729252955e-05,
|
|
"loss": 0.0561,
|
|
"step": 66,
|
|
"step_time": 28.661124781996477
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.009476827806793153,
|
|
"clip_ratio/high_mean": 0.0062825315981172025,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0062825315981172025,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10269.0,
|
|
"completions/max_terminated_length": 10269.0,
|
|
"completions/mean_length": 6501.5,
|
|
"completions/mean_terminated_length": 6501.5,
|
|
"completions/min_length": 724.0,
|
|
"completions/min_terminated_length": 724.0,
|
|
"entropy": 0.14845013478770852,
|
|
"epoch": 0.00067,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9048762917518616,
|
|
"kl": 0.3337597157806158,
|
|
"learning_rate": 9.749597242421838e-05,
|
|
"loss": 0.0677,
|
|
"num_tokens": 9077833.0,
|
|
"reward": 0.6164548397064209,
|
|
"reward_std": 0.5683386325836182,
|
|
"rewards/rollout_eval_reward_func/mean": 0.4744664430618286,
|
|
"rewards/rollout_eval_reward_func/std": 0.29613611102104187,
|
|
"rewards/rollout_reward_func/mean": 0.6164548397064209,
|
|
"rewards/rollout_reward_func/std": 0.6236394643783569,
|
|
"sampling/importance_sampling_ratio/max": 1.9927904605865479,
|
|
"sampling/importance_sampling_ratio/mean": 1.0013047456741333,
|
|
"sampling/importance_sampling_ratio/min": 0.5228504538536072,
|
|
"sampling/sampling_logp_difference/max": 0.6895358562469482,
|
|
"sampling/sampling_logp_difference/mean": 0.011342051438987255,
|
|
"step": 67,
|
|
"step_time": 79.70687562199964
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0376884457655251,
|
|
"clip_ratio/high_mean": 0.026128767582122236,
|
|
"clip_ratio/low_mean": 0.026416301843710244,
|
|
"clip_ratio/low_min": 0.007291666814126074,
|
|
"clip_ratio/region_mean": 0.0525450695422478,
|
|
"entropy": 0.15412914380431175,
|
|
"epoch": 0.00068,
|
|
"grad_norm": 0.8491650223731995,
|
|
"kl": 0.3899666126817465,
|
|
"learning_rate": 9.733378918839942e-05,
|
|
"loss": 0.0638,
|
|
"step": 68,
|
|
"step_time": 27.40538086699962
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.006514550419524312,
|
|
"clip_ratio/high_mean": 0.003257275209762156,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.003257275209762156,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10202.0,
|
|
"completions/max_terminated_length": 10202.0,
|
|
"completions/mean_length": 5860.65625,
|
|
"completions/mean_terminated_length": 5860.65625,
|
|
"completions/min_length": 540.0,
|
|
"completions/min_terminated_length": 540.0,
|
|
"entropy": 0.16269859950989485,
|
|
"epoch": 0.00069,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9143983721733093,
|
|
"kl": 0.47777214366942644,
|
|
"learning_rate": 9.716671037846007e-05,
|
|
"loss": 0.1152,
|
|
"num_tokens": 9293397.0,
|
|
"reward": 0.6956244707107544,
|
|
"reward_std": 0.5506021976470947,
|
|
"rewards/rollout_eval_reward_func/mean": 0.5125762224197388,
|
|
"rewards/rollout_eval_reward_func/std": 0.2857610881328583,
|
|
"rewards/rollout_reward_func/mean": 0.6956244707107544,
|
|
"rewards/rollout_reward_func/std": 0.5588130354881287,
|
|
"sampling/importance_sampling_ratio/max": 1.4407436847686768,
|
|
"sampling/importance_sampling_ratio/mean": 1.0004699230194092,
|
|
"sampling/importance_sampling_ratio/min": 0.5672728419303894,
|
|
"sampling/sampling_logp_difference/max": 0.5669147968292236,
|
|
"sampling/sampling_logp_difference/mean": 0.010705020278692245,
|
|
"step": 69,
|
|
"step_time": 77.93740563500614
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.04634982522111386,
|
|
"clip_ratio/high_mean": 0.029493737209122628,
|
|
"clip_ratio/low_mean": 0.01730769290588796,
|
|
"clip_ratio/low_min": 0.004166666767559946,
|
|
"clip_ratio/region_mean": 0.046801429823972285,
|
|
"entropy": 0.1784980888478458,
|
|
"epoch": 0.0007,
|
|
"grad_norm": 0.7063129544258118,
|
|
"kl": 0.3514184970408678,
|
|
"learning_rate": 9.699475947581644e-05,
|
|
"loss": 0.1049,
|
|
"step": 70,
|
|
"step_time": 27.06573885999751
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0018382353009656072,
|
|
"clip_ratio/high_mean": 0.0009191176504828036,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0009191176504828036,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10593.0,
|
|
"completions/max_terminated_length": 10593.0,
|
|
"completions/mean_length": 6225.28125,
|
|
"completions/mean_terminated_length": 6225.28125,
|
|
"completions/min_length": 1544.0,
|
|
"completions/min_terminated_length": 1544.0,
|
|
"entropy": 0.17604797054082155,
|
|
"epoch": 0.00071,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 0.7393732070922852,
|
|
"kl": 0.20170354284346104,
|
|
"learning_rate": 9.681796064661319e-05,
|
|
"loss": 0.0413,
|
|
"num_tokens": 9520372.0,
|
|
"reward": 0.8103519678115845,
|
|
"reward_std": 0.3980957269668579,
|
|
"rewards/rollout_eval_reward_func/mean": 0.5907012224197388,
|
|
"rewards/rollout_eval_reward_func/std": 0.20860876142978668,
|
|
"rewards/rollout_reward_func/mean": 0.8103519678115845,
|
|
"rewards/rollout_reward_func/std": 0.49828964471817017,
|
|
"sampling/importance_sampling_ratio/max": 1.5257266759872437,
|
|
"sampling/importance_sampling_ratio/mean": 0.9991195201873779,
|
|
"sampling/importance_sampling_ratio/min": 0.6470949649810791,
|
|
"sampling/sampling_logp_difference/max": 0.43526220321655273,
|
|
"sampling/sampling_logp_difference/mean": 0.010150602087378502,
|
|
"step": 71,
|
|
"step_time": 79.67722541299918
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.01454339677002281,
|
|
"clip_ratio/high_mean": 0.008729609136935323,
|
|
"clip_ratio/low_mean": 0.009114583488553762,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.01784419280011207,
|
|
"entropy": 0.18162205442786217,
|
|
"epoch": 0.00072,
|
|
"grad_norm": 0.44046881794929504,
|
|
"kl": 0.20182663016021252,
|
|
"learning_rate": 9.663633873832725e-05,
|
|
"loss": 0.0328,
|
|
"step": 72,
|
|
"step_time": 28.538212690000364
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.004232634324580431,
|
|
"clip_ratio/high_mean": 0.0021163171622902155,
|
|
"clip_ratio/low_mean": 0.0005208333604969084,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.002637150522787124,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 10142.0,
|
|
"completions/max_terminated_length": 10142.0,
|
|
"completions/mean_length": 6840.59375,
|
|
"completions/mean_terminated_length": 6840.59375,
|
|
"completions/min_length": 1013.0,
|
|
"completions/min_terminated_length": 1013.0,
|
|
"entropy": 0.20604060776531696,
|
|
"epoch": 0.00073,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6912176609039307,
|
|
"kl": 0.26183000626042485,
|
|
"learning_rate": 9.644991927627566e-05,
|
|
"loss": -0.0088,
|
|
"num_tokens": 9767000.0,
|
|
"reward": 0.7756590247154236,
|
|
"reward_std": 0.5361872911453247,
|
|
"rewards/rollout_eval_reward_func/mean": 0.5909552574157715,
|
|
"rewards/rollout_eval_reward_func/std": 0.2465948760509491,
|
|
"rewards/rollout_reward_func/mean": 0.7756590247154236,
|
|
"rewards/rollout_reward_func/std": 0.5321318507194519,
|
|
"sampling/importance_sampling_ratio/max": 1.2645851373672485,
|
|
"sampling/importance_sampling_ratio/mean": 1.0009121894836426,
|
|
"sampling/importance_sampling_ratio/min": 0.6386132836341858,
|
|
"sampling/sampling_logp_difference/max": 0.4484562873840332,
|
|
"sampling/sampling_logp_difference/mean": 0.010102368891239166,
|
|
"step": 73,
|
|
"step_time": 82.14820753000458
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.02557993505615741,
|
|
"clip_ratio/high_mean": 0.01748599053826183,
|
|
"clip_ratio/low_mean": 0.009895833674818277,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.027381824154872447,
|
|
"entropy": 0.22169003915041685,
|
|
"epoch": 0.00074,
|
|
"grad_norm": 0.42521047592163086,
|
|
"kl": 0.23322301171720028,
|
|
"learning_rate": 9.625872846002834e-05,
|
|
"loss": -0.0155,
|
|
"step": 74,
|
|
"step_time": 28.134478513999056
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.008986742584966123,
|
|
"clip_ratio/high_mean": 0.005014204594772309,
|
|
"clip_ratio/low_mean": 0.002018229220993817,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.007032433815766126,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9857.0,
|
|
"completions/max_terminated_length": 9857.0,
|
|
"completions/mean_length": 7195.375,
|
|
"completions/mean_terminated_length": 7195.375,
|
|
"completions/min_length": 768.0,
|
|
"completions/min_terminated_length": 768.0,
|
|
"entropy": 0.27833056077361107,
|
|
"epoch": 0.00075,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8185970187187195,
|
|
"kl": 0.24367811996489763,
|
|
"learning_rate": 9.606279315972582e-05,
|
|
"loss": -0.1492,
|
|
"num_tokens": 10025024.0,
|
|
"reward": 0.2978099584579468,
|
|
"reward_std": 0.6597497463226318,
|
|
"rewards/rollout_eval_reward_func/mean": 0.32901421189308167,
|
|
"rewards/rollout_eval_reward_func/std": 0.3157320022583008,
|
|
"rewards/rollout_reward_func/mean": 0.2978099584579468,
|
|
"rewards/rollout_reward_func/std": 0.690675675868988,
|
|
"sampling/importance_sampling_ratio/max": 1.4156017303466797,
|
|
"sampling/importance_sampling_ratio/mean": 1.0000808238983154,
|
|
"sampling/importance_sampling_ratio/min": 0.6558278799057007,
|
|
"sampling/sampling_logp_difference/max": 0.4218568801879883,
|
|
"sampling/sampling_logp_difference/mean": 0.013327672146260738,
|
|
"step": 75,
|
|
"step_time": 88.96669697499601
|
|
},
|
|
{
|
|
"epoch": 0.00075,
|
|
"eval_clip_ratio/high_max": 0.0,
|
|
"eval_clip_ratio/high_mean": 0.0,
|
|
"eval_clip_ratio/low_mean": 0.0,
|
|
"eval_clip_ratio/low_min": 0.0,
|
|
"eval_clip_ratio/region_mean": 0.0,
|
|
"eval_completions/clipped_ratio": 0.0,
|
|
"eval_completions/max_length": 9194.0,
|
|
"eval_completions/max_terminated_length": 9194.0,
|
|
"eval_completions/mean_length": 7026.0375,
|
|
"eval_completions/mean_terminated_length": 7026.0375,
|
|
"eval_completions/min_length": 4333.95,
|
|
"eval_completions/min_terminated_length": 4333.95,
|
|
"eval_entropy": 0.3085056647658348,
|
|
"eval_frac_reward_zero_std": 1.0,
|
|
"eval_kl": 0.22236853390932082,
|
|
"eval_loss": 0.0002063037100015208,
|
|
"eval_num_tokens": 10025024.0,
|
|
"eval_reward": 0.35444250535219907,
|
|
"eval_reward_std": 0.0,
|
|
"eval_rewards/rollout_eval_reward_func/mean": 0.3484247986227274,
|
|
"eval_rewards/rollout_eval_reward_func/std": 0.26531881298869847,
|
|
"eval_rewards/rollout_reward_func/mean": 0.35444250535219907,
|
|
"eval_rewards/rollout_reward_func/std": 0.5791118375957012,
|
|
"eval_runtime": 161.4965,
|
|
"eval_samples_per_second": 0.062,
|
|
"eval_sampling/importance_sampling_ratio/max": 1.1964155852794647,
|
|
"eval_sampling/importance_sampling_ratio/mean": 1.0003154128789902,
|
|
"eval_sampling/importance_sampling_ratio/min": 0.7968822807073593,
|
|
"eval_sampling/sampling_logp_difference/max": 0.2617991387844086,
|
|
"eval_sampling/sampling_logp_difference/mean": 0.01210988024249673,
|
|
"eval_steps_per_second": 0.019,
|
|
"step": 75
|
|
}
|
|
],
|
|
"logging_steps": 1.0,
|
|
"max_steps": 300,
|
|
"num_input_tokens_seen": 10025024,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|