61292 lines
2.1 MiB
61292 lines
2.1 MiB
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 7.997849462365592,
|
|
"eval_steps": 500,
|
|
"global_step": 2784,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.022431484925773e-09,
|
|
"advantages/std": 0.6185691356658936,
|
|
"advantages/var": 0.3826277755984506,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.002867383512544803,
|
|
"grad_norm": 0.12072640104660734,
|
|
"learning_rate": 2e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 87679.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.18543373048305511,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7815011540266774e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.005734767025089606,
|
|
"grad_norm": 0.11813835707013225,
|
|
"learning_rate": 1.999999366948742e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 170424.0,
|
|
"reward": 0.5703125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.5703125,
|
|
"rewards/drgrpo_math_reward/std": 0.4969765841960907,
|
|
"step": 2
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979305421724224e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.008602150537634409,
|
|
"grad_norm": 0.08307332827566753,
|
|
"learning_rate": 1.9999974677957702e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 263226.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 3
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.011469534050179211,
|
|
"grad_norm": 0.07731912706360194,
|
|
"learning_rate": 1.9999943025434887e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 344864.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 4
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 7.31812171195528e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.59375,
|
|
"epoch": 0.014336917562724014,
|
|
"grad_norm": 0.09771629934875779,
|
|
"learning_rate": 1.9999898711959057e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 453394.0,
|
|
"reward": 0.453125,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.453125,
|
|
"rewards/drgrpo_math_reward/std": 0.4997538626194,
|
|
"step": 5
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.2583933094613625e-09,
|
|
"advantages/std": 0.6185742020606995,
|
|
"advantages/var": 0.38263404345503105,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 0.017204301075268817,
|
|
"grad_norm": 0.1342631914269599,
|
|
"learning_rate": 1.999984173758631e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 545520.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.19438527524471283,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 6
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.02007168458781362,
|
|
"grad_norm": 0.12138951673049583,
|
|
"learning_rate": 1.999977210238878e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 634042.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 7
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.4083979969389336e-09,
|
|
"advantages/std": 0.6612637639045715,
|
|
"advantages/var": 0.4372697654532409,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.022939068100358423,
|
|
"grad_norm": 0.23519275858410402,
|
|
"learning_rate": 1.999968980645464e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 720467.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1938612163066864,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 8
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 9.786317257174392e-09,
|
|
"advantages/std": 0.6185775995254517,
|
|
"advantages/var": 0.38263824663467005,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.025806451612903226,
|
|
"grad_norm": 0.0988104248714376,
|
|
"learning_rate": 1.9999594849888084e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 823071.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.19674429297447205,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 9
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.02867383512544803,
|
|
"grad_norm": 0.07287546496174245,
|
|
"learning_rate": 1.9999487232809332e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 919261.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 10
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875209889720355e-09,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.031541218637992835,
|
|
"grad_norm": 0.07623329608908455,
|
|
"learning_rate": 1.9999366955354637e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 1004963.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 11
|
|
},
|
|
{
|
|
"advantages/mean": 6.05359673500061e-09,
|
|
"advantages/snr": 9.786512459149173e-09,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.034408602150537634,
|
|
"grad_norm": 0.1237469953585538,
|
|
"learning_rate": 1.999923401767629e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 1090102.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.1820138692855835,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 12
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.1266932113463096e-08,
|
|
"advantages/std": 0.6612785458564758,
|
|
"advantages/var": 0.4372893152100552,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.03727598566308244,
|
|
"grad_norm": 0.1434758279295005,
|
|
"learning_rate": 1.9999088419942594e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 1185270.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.21436068415641785,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 13
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917033813576203e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.04014336917562724,
|
|
"grad_norm": 0.12193245342510368,
|
|
"learning_rate": 1.99989301623379e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 1278859.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 14
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262492693233955e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.043010752688172046,
|
|
"grad_norm": 0.13613144621465645,
|
|
"learning_rate": 1.999875924506258e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 1378423.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 15
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962749759103603e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.045878136200716846,
|
|
"grad_norm": 0.052480010681470804,
|
|
"learning_rate": 1.999857566833302e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 1466568.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 16
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 7.528086507037825e-10,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.04874551971326165,
|
|
"grad_norm": 0.08619603417965296,
|
|
"learning_rate": 1.9998379432381658e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 1561456.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1820138692855835,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 17
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.2351708626582845e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.05161290322580645,
|
|
"grad_norm": 0.07287898456293838,
|
|
"learning_rate": 1.999817053745694e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 1650097.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 18
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.6261990006727604e-09,
|
|
"advantages/std": 0.5726990103721619,
|
|
"advantages/var": 0.32798415648125356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.05448028673835126,
|
|
"grad_norm": 0.21959986144796612,
|
|
"learning_rate": 1.999794898382336e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 1757617.0,
|
|
"reward": 0.3984375,
|
|
"reward_std": 0.17806214094161987,
|
|
"rewards/drgrpo_math_reward/mean": 0.3984375,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 19
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.0224895169745475e-09,
|
|
"advantages/std": 0.618563175201416,
|
|
"advantages/var": 0.3826204017152577,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.05734767025089606,
|
|
"grad_norm": 0.10157356395625096,
|
|
"learning_rate": 1.999771477176142e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 1850850.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.17859892547130585,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 20
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.060215053763440864,
|
|
"grad_norm": 0.036696350533152414,
|
|
"learning_rate": 1.9997467901567657e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 1928908.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 21
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016648251989223e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 0.06308243727598567,
|
|
"grad_norm": 0.11866636461690809,
|
|
"learning_rate": 1.9997208373554635e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2015022.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 22
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125846186210722e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.06594982078853047,
|
|
"grad_norm": 4.854693100648363,
|
|
"learning_rate": 1.9996936188050943e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2093001.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13098980486392975,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 23
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 2.816728712284573e-09,
|
|
"advantages/std": 0.661279559135437,
|
|
"advantages/var": 0.43729065533035794,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.421875,
|
|
"epoch": 0.06881720430107527,
|
|
"grad_norm": 0.11279722346323875,
|
|
"learning_rate": 1.9996651345401195e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2194124.0,
|
|
"reward": 0.5234375,
|
|
"reward_std": 0.21648234128952026,
|
|
"rewards/drgrpo_math_reward/mean": 0.5234375,
|
|
"rewards/drgrpo_math_reward/std": 0.5014128684997559,
|
|
"step": 24
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814329083993624e-09,
|
|
"advantages/std": 0.5227940678596497,
|
|
"advantages/var": 0.27331363738923997,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.07168458781362007,
|
|
"grad_norm": 0.12123531756465056,
|
|
"learning_rate": 1.999635384596603e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2289186.0,
|
|
"reward": 0.5234375,
|
|
"reward_std": 0.1433563083410263,
|
|
"rewards/drgrpo_math_reward/mean": 0.5234375,
|
|
"rewards/drgrpo_math_reward/std": 0.5014128684997559,
|
|
"step": 25
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016467281190481e-09,
|
|
"advantages/std": 0.5227928161621094,
|
|
"advantages/var": 0.2733123286307091,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.671875,
|
|
"epoch": 0.07455197132616488,
|
|
"grad_norm": 0.09411124196600001,
|
|
"learning_rate": 1.9996043690122116e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2376485.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 26
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185722351074219,
|
|
"advantages/var": 0.3826316100457916,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.07741935483870968,
|
|
"grad_norm": 0.11120568828847453,
|
|
"learning_rate": 1.999572087826214e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2466063.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.19097033143043518,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 27
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.08028673835125448,
|
|
"grad_norm": 0.09857405192647845,
|
|
"learning_rate": 1.9995385410794814e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 2544984.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 28
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 9.78663788288456e-09,
|
|
"advantages/std": 0.618557333946228,
|
|
"advantages/var": 0.38261317537866546,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.08315412186379928,
|
|
"grad_norm": 0.13390586710223532,
|
|
"learning_rate": 1.999503728814488e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2650054.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.17176413536071777,
|
|
"rewards/drgrpo_math_reward/mean": 0.59375,
|
|
"rewards/drgrpo_math_reward/std": 0.4930621087551117,
|
|
"step": 29
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4676070809364319,
|
|
"advantages/var": 0.21865638214189076,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.08602150537634409,
|
|
"grad_norm": 0.07979029691529577,
|
|
"learning_rate": 1.9994676510753086e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2748901.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.12019839137792587,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 30
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 7.528111170810656e-10,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.08888888888888889,
|
|
"grad_norm": 0.10099304220854324,
|
|
"learning_rate": 1.9994303079076223e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 2850862.0,
|
|
"reward": 0.5234375,
|
|
"reward_std": 0.17859891057014465,
|
|
"rewards/drgrpo_math_reward/mean": 0.5234375,
|
|
"rewards/drgrpo_math_reward/std": 0.5014128684997559,
|
|
"step": 31
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2524139108748537e-09,
|
|
"advantages/std": 0.5726962089538574,
|
|
"advantages/var": 0.3279809477501203,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.09175627240143369,
|
|
"grad_norm": 0.09555584445187362,
|
|
"learning_rate": 1.999391699358709e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 2946035.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.17358636856079102,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 32
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 9.797729076606565e-09,
|
|
"advantages/std": 0.5228021740913391,
|
|
"advantages/var": 0.27332211323463085,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.09462365591397849,
|
|
"grad_norm": 0.095240391953869,
|
|
"learning_rate": 1.9993518254774516e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 3030677.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.15148437023162842,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 33
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016611691573306e-09,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.0974910394265233,
|
|
"grad_norm": 0.09573900444961869,
|
|
"learning_rate": 1.9993106863143336e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 3114495.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 34
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.4251693997307513e-08,
|
|
"advantages/std": 0.5227856040000916,
|
|
"advantages/var": 0.27330478774974054,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.1003584229390681,
|
|
"grad_norm": 0.09358643008271235,
|
|
"learning_rate": 1.9992682819214415e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 3196979.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 35
|
|
},
|
|
{
|
|
"advantages/mean": 6.05359673500061e-09,
|
|
"advantages/snr": 1.1579395756169161e-08,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.1032258064516129,
|
|
"grad_norm": 0.16735976169723088,
|
|
"learning_rate": 1.9992246123524646e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 3284771.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.13781969249248505,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 36
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962665216109293e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.1060931899641577,
|
|
"grad_norm": 0.12078546374354511,
|
|
"learning_rate": 1.999179677662692e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 3367767.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 37
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907268126346096e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.10896057347670252,
|
|
"grad_norm": 0.10079542415136743,
|
|
"learning_rate": 1.999133477909016e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 3457120.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 38
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.269660554926477e-09,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.11182795698924732,
|
|
"grad_norm": 6.532784934364094,
|
|
"learning_rate": 1.9990860131499304e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 3543851.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.1820138692855835,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 39
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.344354173221399e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.11469534050179211,
|
|
"grad_norm": 0.09757495386632052,
|
|
"learning_rate": 1.9990372834455305e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 3627069.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 40
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.2524670566012098e-09,
|
|
"advantages/std": 0.5726868510246277,
|
|
"advantages/var": 0.3279702293365041,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.11756272401433691,
|
|
"grad_norm": 0.09730325289294997,
|
|
"learning_rate": 1.9989872888575127e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 3711073.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.16545338928699493,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 41
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917148060443483e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.12043010752688173,
|
|
"grad_norm": 0.0874621365795184,
|
|
"learning_rate": 1.9989360294491754e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 3797362.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 42
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 2.8166759048326044e-09,
|
|
"advantages/std": 0.6612919569015503,
|
|
"advantages/var": 0.43730705226268185,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.515625,
|
|
"epoch": 0.12329749103942653,
|
|
"grad_norm": 0.1518769862850437,
|
|
"learning_rate": 1.9988835052854186e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 3912127.0,
|
|
"reward": 0.4140625,
|
|
"reward_std": 0.23250606656074524,
|
|
"rewards/drgrpo_math_reward/mean": 0.4140625,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 43
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.065538968228456e-09,
|
|
"advantages/std": 0.5726931691169739,
|
|
"advantages/var": 0.32797746595324284,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.12616487455197134,
|
|
"grad_norm": 0.09380987299728875,
|
|
"learning_rate": 1.9988297164327424e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 4006775.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.172288179397583,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 44
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.12903225806451613,
|
|
"grad_norm": 0.12294840293247179,
|
|
"learning_rate": 1.9987746629592504e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 4100127.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 45
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.13189964157706094,
|
|
"grad_norm": 0.10521562596882879,
|
|
"learning_rate": 1.9987183449346446e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 4177228.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 46
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 2.6555842735419153e-09,
|
|
"advantages/std": 0.7014068961143494,
|
|
"advantages/var": 0.4919716339167657,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.59375,
|
|
"epoch": 0.13476702508960572,
|
|
"grad_norm": 0.14721937323309717,
|
|
"learning_rate": 1.9986607624302303e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 4285112.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.26249876618385315,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 47
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899540529955257e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.13763440860215054,
|
|
"grad_norm": 0.08261311695650624,
|
|
"learning_rate": 1.9986019155189124e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 4385910.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 48
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.2196811125523014e-08,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.14050179211469535,
|
|
"grad_norm": 0.11604750525829459,
|
|
"learning_rate": 1.9985418042751972e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 4474601.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 49
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 5.633320329065741e-09,
|
|
"advantages/std": 0.6612956523895264,
|
|
"advantages/var": 0.4373119398692893,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.14336917562724014,
|
|
"grad_norm": 0.09818812063392984,
|
|
"learning_rate": 1.9984804287751916e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 4573799.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.23592591285705566,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 50
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.14623655913978495,
|
|
"grad_norm": 0.102057115147288,
|
|
"learning_rate": 1.9984177890966035e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 4654716.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 51
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.4535487590567085e-09,
|
|
"advantages/std": 0.5227980017662048,
|
|
"advantages/var": 0.2733177506507367,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.14910394265232976,
|
|
"grad_norm": 0.11635955339733696,
|
|
"learning_rate": 1.998353885318741e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 4738961.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.14571532607078552,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 52
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.011143494991813e-09,
|
|
"advantages/std": 0.6185839772224426,
|
|
"advantages/var": 0.3826461368763354,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.609375,
|
|
"epoch": 0.15197132616487455,
|
|
"grad_norm": 0.1053439223761662,
|
|
"learning_rate": 1.9982887175225135e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 4833833.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.20463991165161133,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 53
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 5.691829197043768e-09,
|
|
"advantages/std": 0.5726856589317322,
|
|
"advantages/var": 0.3279688639460723,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.15483870967741936,
|
|
"grad_norm": 0.14214478632458308,
|
|
"learning_rate": 1.9982222857904287e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 4920722.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.16333173215389252,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 54
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349153895649778e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.15770609318996415,
|
|
"grad_norm": 0.07374782139639248,
|
|
"learning_rate": 1.9981545902065973e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 5007022.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 55
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962814881146116e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.16057347670250896,
|
|
"grad_norm": 0.062216704713937636,
|
|
"learning_rate": 1.998085630856728e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 5086937.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 56
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 7.041880175145966e-10,
|
|
"advantages/std": 0.6612740755081177,
|
|
"advantages/var": 0.4372834029391157,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.16344086021505377,
|
|
"grad_norm": 0.1276968079977411,
|
|
"learning_rate": 1.998015407828131e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 5187209.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.20964756608009338,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 57
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675983786582947,
|
|
"advantages/var": 0.21864824372386593,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 0.16630824372759856,
|
|
"grad_norm": 0.08001414737929904,
|
|
"learning_rate": 1.997943921209715e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 5288667.0,
|
|
"reward": 0.5546875,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.5546875,
|
|
"rewards/drgrpo_math_reward/std": 0.4989531338214874,
|
|
"step": 58
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814195036226835e-09,
|
|
"advantages/std": 0.5227980017662048,
|
|
"advantages/var": 0.2733177506507367,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.16917562724014337,
|
|
"grad_norm": 0.13591135236989843,
|
|
"learning_rate": 1.997871171091991e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 5378634.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.14571532607078552,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 59
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 6.005323810328915e-09,
|
|
"advantages/std": 0.7754141092300415,
|
|
"advantages/var": 0.6012670407930187,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.17204301075268819,
|
|
"grad_norm": 0.15618674804291033,
|
|
"learning_rate": 1.9977971575670664e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 5474256.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.2885475754737854,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 60
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.17491039426523297,
|
|
"grad_norm": 0.1138327127178479,
|
|
"learning_rate": 1.9977218807286505e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 5563557.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.5,
|
|
"rewards/drgrpo_math_reward/std": 0.5019646286964417,
|
|
"step": 61
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.011196882346898e-09,
|
|
"advantages/std": 0.618573009967804,
|
|
"advantages/var": 0.3826325686606289,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 0.17777777777777778,
|
|
"grad_norm": 0.08132114696040446,
|
|
"learning_rate": 1.997645340672052e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 5663385.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.18885357677936554,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 62
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.300980052408865e-08,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.18064516129032257,
|
|
"grad_norm": 0.10257084432126386,
|
|
"learning_rate": 1.9975675374941777e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 5747683.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.16675157845020294,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 63
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.0224889366485245e-09,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.18351254480286738,
|
|
"grad_norm": 0.23992414863040273,
|
|
"learning_rate": 1.9974884712935348e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 5822682.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.17859892547130585,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 64
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125814501076877e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.1863799283154122,
|
|
"grad_norm": 0.11197749985688031,
|
|
"learning_rate": 1.9974081421702293e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 5923545.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 65
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.562938529588136e-09,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.18924731182795698,
|
|
"grad_norm": 0.11472039064281611,
|
|
"learning_rate": 1.997326550225966e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 6024457.0,
|
|
"reward": 0.3828125,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.3828125,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 66
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.504965933612274e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.1921146953405018,
|
|
"grad_norm": 0.1418423341425889,
|
|
"learning_rate": 1.9972436955640485e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 6104953.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 67
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.1949820788530466,
|
|
"grad_norm": 0.0804894342413209,
|
|
"learning_rate": 1.9971595782893793e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 6186002.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 68
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.13121587994051e-09,
|
|
"advantages/std": 0.5726834535598755,
|
|
"advantages/var": 0.32796633798126607,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.1978494623655914,
|
|
"grad_norm": 0.11992450664488502,
|
|
"learning_rate": 1.99707419850846e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 6273682.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 69
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 8.90714930920747e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.59375,
|
|
"epoch": 0.2007168458781362,
|
|
"grad_norm": 0.09013588232439032,
|
|
"learning_rate": 1.9969875563293894e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 6370954.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.53125,
|
|
"rewards/drgrpo_math_reward/std": 0.5009832978248596,
|
|
"step": 70
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.203584229390681,
|
|
"grad_norm": 0.08158329080487958,
|
|
"learning_rate": 1.996899651861866e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 6459977.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 71
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 5.03866787895554e-09,
|
|
"advantages/std": 0.7393403053283691,
|
|
"advantages/var": 0.5466240870830461,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 0.2064516129032258,
|
|
"grad_norm": 0.16017306287798955,
|
|
"learning_rate": 1.996810485217186e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 6558404.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.28223681449890137,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 72
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958683201273463e-10,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.20931899641577062,
|
|
"grad_norm": 0.08280502539118079,
|
|
"learning_rate": 1.9967200565082424e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 6640957.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 73
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 8.630640087964663e-09,
|
|
"advantages/std": 0.7014076113700867,
|
|
"advantages/var": 0.49197263728789054,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.2121863799283154,
|
|
"grad_norm": 0.15761095907087838,
|
|
"learning_rate": 1.9966283658495283e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 6743334.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.2603819966316223,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 74
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 2.816762225733049e-09,
|
|
"advantages/std": 0.6612716913223267,
|
|
"advantages/var": 0.4372802497442905,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.21505376344086022,
|
|
"grad_norm": 0.19829770317655657,
|
|
"learning_rate": 1.996535413357133e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 6831502.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.20517179369926453,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 75
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.408361690336558e-09,
|
|
"advantages/std": 0.6612808108329773,
|
|
"advantages/var": 0.4372923107759199,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.21792114695340503,
|
|
"grad_norm": 0.18264161110922233,
|
|
"learning_rate": 1.9964411991487446e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 6926494.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.21542644500732422,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 76
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 0.22078853046594982,
|
|
"grad_norm": 0.03928779024896764,
|
|
"learning_rate": 1.9963457233436466e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 7017039.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 77
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 2.5193428761467165e-09,
|
|
"advantages/std": 0.739337682723999,
|
|
"advantages/var": 0.5466202090956926,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.22365591397849463,
|
|
"grad_norm": 0.2506819296373822,
|
|
"learning_rate": 1.9962489860627224e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 7110808.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.2767002582550049,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 78
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.225131532618201e-09,
|
|
"advantages/std": 0.6612735390663147,
|
|
"advantages/var": 0.43728269346928883,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.22652329749103942,
|
|
"grad_norm": 0.1557864040641026,
|
|
"learning_rate": 1.9961509874284507e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 7196850.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.20858672261238098,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 79
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 9.958501673983143e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.22939068100358423,
|
|
"grad_norm": 0.09767097714609889,
|
|
"learning_rate": 1.9960517275649076e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 7283197.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.11336850374937057,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 80
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.23225806451612904,
|
|
"grad_norm": 0.07097210651744987,
|
|
"learning_rate": 1.995951206597767e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 7373022.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 81
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.562875972037933e-09,
|
|
"advantages/std": 0.5227925777435303,
|
|
"advantages/var": 0.27331207934372515,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.23512544802867383,
|
|
"grad_norm": 0.07965713683163471,
|
|
"learning_rate": 1.9958494246542984e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 7467194.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.13782459497451782,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 82
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125995678848164e-09,
|
|
"advantages/std": 0.5227746963500977,
|
|
"advantages/var": 0.2732933831439368,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.23799283154121864,
|
|
"grad_norm": 0.18075885139952805,
|
|
"learning_rate": 1.9957463818633678e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 7547836.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.12179600447416306,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 83
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504965933612274e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.609375,
|
|
"epoch": 0.24086021505376345,
|
|
"grad_norm": 0.11479922684896822,
|
|
"learning_rate": 1.9956420783554387e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 7651354.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.16097760200500488,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 84
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 8.94433653701192e-09,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.24372759856630824,
|
|
"grad_norm": 0.06924948592705259,
|
|
"learning_rate": 1.9955365142625695e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 7740603.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 85
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.0112157424628863e-09,
|
|
"advantages/std": 0.6185691356658936,
|
|
"advantages/var": 0.3826277755984506,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 0.24659498207885305,
|
|
"grad_norm": 0.11054180140229349,
|
|
"learning_rate": 1.9954296897184152e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 7833309.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.18543371558189392,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 86
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.24946236559139784,
|
|
"grad_norm": 0.07139527483113263,
|
|
"learning_rate": 1.9953216048582267e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 7920489.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 87
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.505166341645741e-09,
|
|
"advantages/std": 0.5726664066314697,
|
|
"advantages/var": 0.32794681328419983,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.2523297491039427,
|
|
"grad_norm": 0.0812733661592712,
|
|
"learning_rate": 1.99521225981885e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 8009754.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.13941730558872223,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 88
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.252425081488457e-09,
|
|
"advantages/std": 0.5726942420005798,
|
|
"advantages/var": 0.3279786948206187,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.25519713261648747,
|
|
"grad_norm": 0.1334826079479053,
|
|
"learning_rate": 1.9951016547387284e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 8094420.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.17358146607875824,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 89
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.979305421724224e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.671875,
|
|
"epoch": 0.25806451612903225,
|
|
"grad_norm": 0.11149895094992536,
|
|
"learning_rate": 1.994989789757898e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 8187321.0,
|
|
"reward": 0.5546875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.5546875,
|
|
"rewards/drgrpo_math_reward/std": 0.4989531338214874,
|
|
"step": 90
|
|
},
|
|
{
|
|
"advantages/mean": 6.05359673500061e-09,
|
|
"advantages/snr": 1.0570293504372942e-08,
|
|
"advantages/std": 0.5726990103721619,
|
|
"advantages/var": 0.32798415648125356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.26093189964157704,
|
|
"grad_norm": 0.1503200281820486,
|
|
"learning_rate": 1.9948766650179924e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 8277097.0,
|
|
"reward": 0.5078125,
|
|
"reward_std": 0.17806214094161987,
|
|
"rewards/drgrpo_math_reward/mean": 0.5078125,
|
|
"rewards/drgrpo_math_reward/std": 0.5019033551216125,
|
|
"step": 91
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.2637992831541219,
|
|
"grad_norm": 0.08503746578744582,
|
|
"learning_rate": 1.9947622806622382e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 8377236.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 92
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.26666666666666666,
|
|
"grad_norm": 0.05712285492918599,
|
|
"learning_rate": 1.994646636835458e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 8462836.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 93
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958849501312727e-10,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.26953405017921145,
|
|
"grad_norm": 0.07317141316141637,
|
|
"learning_rate": 1.9945297336840688e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 8536756.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 94
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344599133202926e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.2724014336917563,
|
|
"grad_norm": 0.07050755865291349,
|
|
"learning_rate": 1.994411571356082e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 8627122.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 95
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.78141726951285e-09,
|
|
"advantages/std": 0.5227986574172974,
|
|
"advantages/var": 0.27331843619732865,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.2752688172043011,
|
|
"grad_norm": 0.09077858932382432,
|
|
"learning_rate": 1.9942921500011035e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 8704876.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.14677615463733673,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 96
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.0349232344696665e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.27813620071684586,
|
|
"grad_norm": 0.06366679062939669,
|
|
"learning_rate": 1.994171469770333e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 8788584.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 97
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5056352916096475e-09,
|
|
"advantages/std": 0.618557870388031,
|
|
"advantages/var": 0.38261383901897617,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.2810035842293907,
|
|
"grad_norm": 0.14557426372001275,
|
|
"learning_rate": 1.994049530816563e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 8874456.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.17282496392726898,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 98
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-09,
|
|
"advantages/snr": 1.4083628328234916e-08,
|
|
"advantages/std": 0.6612802743911743,
|
|
"advantages/var": 0.4372916012988668,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.2838709677419355,
|
|
"grad_norm": 0.17422381430621553,
|
|
"learning_rate": 1.993926333294182e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 8963952.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.21436560153961182,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 99
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.2867383512544803,
|
|
"grad_norm": 0.07608279904949532,
|
|
"learning_rate": 1.9938018773591697e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 9039636.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 100
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.011171929325739e-09,
|
|
"advantages/std": 0.6185781359672546,
|
|
"advantages/var": 0.38263891029672337,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.2896057347670251,
|
|
"grad_norm": 0.13534176496966346,
|
|
"learning_rate": 1.9936761631691005e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 9122751.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.19780512154102325,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 101
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.327802856091802e-09,
|
|
"advantages/std": 0.7014012336730957,
|
|
"advantages/var": 0.4919636905981406,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.2924731182795699,
|
|
"grad_norm": 0.17364832338354966,
|
|
"learning_rate": 1.993549190883142e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 9210361.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.25460314750671387,
|
|
"rewards/drgrpo_math_reward/mean": 0.5625,
|
|
"rewards/drgrpo_math_reward/std": 0.49802759289741516,
|
|
"step": 102
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.757447884868554e-09,
|
|
"advantages/std": 0.572684109210968,
|
|
"advantages/var": 0.32796708894275994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.2953405017921147,
|
|
"grad_norm": 0.1186100841602063,
|
|
"learning_rate": 1.9934209606620533e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 9287648.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 103
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.2982078853046595,
|
|
"grad_norm": 0.05043194574683359,
|
|
"learning_rate": 1.993291472668187e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 9365932.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 104
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.439391676409106e-09,
|
|
"advantages/std": 0.5726771354675293,
|
|
"advantages/var": 0.3279591014872949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.3010752688172043,
|
|
"grad_norm": 0.1576253855421573,
|
|
"learning_rate": 1.993160727065489e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 9438450.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.1530819982290268,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 105
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.757241732624561e-09,
|
|
"advantages/std": 0.5726962089538574,
|
|
"advantages/var": 0.3279809477501203,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.3039426523297491,
|
|
"grad_norm": 0.09728783396370186,
|
|
"learning_rate": 1.9930287240194956e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 9533008.0,
|
|
"reward": 0.5546875,
|
|
"reward_std": 0.17358636856079102,
|
|
"rewards/drgrpo_math_reward/mean": 0.5546875,
|
|
"rewards/drgrpo_math_reward/std": 0.4989531338214874,
|
|
"step": 106
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.493752592678455e-08,
|
|
"advantages/std": 0.46760883927345276,
|
|
"advantages/var": 0.21865802656666578,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.3068100358422939,
|
|
"grad_norm": 0.15227602098741566,
|
|
"learning_rate": 1.992895463697337e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 9614514.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11914245784282684,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 107
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917699002625455e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.3096774193548387,
|
|
"grad_norm": 0.08145354636826882,
|
|
"learning_rate": 1.992760946267734e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 9699069.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 108
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907308748317195e-09,
|
|
"advantages/std": 0.5227856040000916,
|
|
"advantages/var": 0.27330478774974054,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.3125448028673835,
|
|
"grad_norm": 0.08781585258601152,
|
|
"learning_rate": 1.9926251719009997e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 9788484.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 109
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.3010125501632784e-08,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.3154121863799283,
|
|
"grad_norm": 0.08383385156663044,
|
|
"learning_rate": 1.9924881407690383e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 9886743.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.15072788298130035,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 110
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344329800322181e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.31827956989247314,
|
|
"grad_norm": 0.12773517568175324,
|
|
"learning_rate": 1.9923498530453453e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 9972379.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.13888053596019745,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 111
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5055823378090512e-09,
|
|
"advantages/std": 0.618579626083374,
|
|
"advantages/var": 0.3826407538054468,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.3211469534050179,
|
|
"grad_norm": 0.1096834650176715,
|
|
"learning_rate": 1.992210308905007e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10064282.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.2001592218875885,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 112
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814215346364857e-09,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.3240143369175627,
|
|
"grad_norm": 0.14542735807746063,
|
|
"learning_rate": 1.992069508524701e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10156504.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 113
|
|
},
|
|
{
|
|
"advantages/mean": 5.587935447692871e-09,
|
|
"advantages/snr": 8.45011606469443e-09,
|
|
"advantages/std": 0.6612850427627563,
|
|
"advantages/var": 0.4372979077817405,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.32688172043010755,
|
|
"grad_norm": 0.1774709869930943,
|
|
"learning_rate": 1.9919274520826937e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 10237639.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.22331714630126953,
|
|
"rewards/drgrpo_math_reward/mean": 0.5625,
|
|
"rewards/drgrpo_math_reward/std": 0.49802759289741516,
|
|
"step": 114
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.449820789068217e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.32974910394265233,
|
|
"grad_norm": 0.11207851970747312,
|
|
"learning_rate": 1.991784139758845e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10308114.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 115
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.3326164874551971,
|
|
"grad_norm": 0.061326552584535385,
|
|
"learning_rate": 1.9916395717346014e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10389388.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 116
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.33548387096774196,
|
|
"grad_norm": 0.17403189241869,
|
|
"learning_rate": 1.991493748193002e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 10475315.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 117
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299880526045478e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.33835125448028674,
|
|
"grad_norm": 0.05760583506548298,
|
|
"learning_rate": 1.991346669318674e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10547767.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 118
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.527911689045256e-09,
|
|
"advantages/std": 0.618579626083374,
|
|
"advantages/var": 0.3826407538054468,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.34121863799283153,
|
|
"grad_norm": 0.13250194607386148,
|
|
"learning_rate": 1.991198335297834e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10639778.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.2001592367887497,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 119
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814298618414938e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.34408602150537637,
|
|
"grad_norm": 0.11855103268565774,
|
|
"learning_rate": 1.9910487463182875e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10722543.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 120
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6722127326138678e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.34695340501792116,
|
|
"grad_norm": 0.11898391996372083,
|
|
"learning_rate": 1.990897902569431e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10807430.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 121
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 3.763904706701757e-09,
|
|
"advantages/std": 0.6185880303382874,
|
|
"advantages/var": 0.3826511512778019,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.34982078853046594,
|
|
"grad_norm": 0.13013435976302207,
|
|
"learning_rate": 1.990745804242247e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10908671.0,
|
|
"reward": 0.3984375,
|
|
"reward_std": 0.21146979928016663,
|
|
"rewards/drgrpo_math_reward/mean": 0.3984375,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 122
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.35268817204301073,
|
|
"grad_norm": 0.07003152151091946,
|
|
"learning_rate": 1.9905924515293086e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 10998970.0,
|
|
"reward": 0.5546875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.5546875,
|
|
"rewards/drgrpo_math_reward/std": 0.4989531338214874,
|
|
"step": 123
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.034946312888557e-08,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 0.16702168626513209,
|
|
"learning_rate": 1.990437844624775e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 11077854.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 124
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 6.005250875554911e-09,
|
|
"advantages/std": 0.775423526763916,
|
|
"advantages/var": 0.6012816458589896,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.35842293906810035,
|
|
"grad_norm": 0.22470812589114758,
|
|
"learning_rate": 1.990281983724395e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 11156158.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.30221718549728394,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 125
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.36129032258064514,
|
|
"grad_norm": 0.07239011287954675,
|
|
"learning_rate": 1.9901248690255043e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 11242573.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 126
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504917864602565e-09,
|
|
"advantages/std": 0.5726882815361023,
|
|
"advantages/var": 0.32797186780877396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.36415770609319,
|
|
"grad_norm": 0.11009253074124153,
|
|
"learning_rate": 1.989966500727026e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 11330169.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.1643974632024765,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 127
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.36702508960573477,
|
|
"grad_norm": 0.11712893123786251,
|
|
"learning_rate": 1.989806879029471e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 11421447.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 128
|
|
},
|
|
{
|
|
"advantages/mean": -9.778887033462524e-09,
|
|
"advantages/snr": 1.3226438138076389e-08,
|
|
"advantages/std": 0.7393439412117004,
|
|
"advantages/var": 0.5466294634064504,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.65625,
|
|
"epoch": 0.36989247311827955,
|
|
"grad_norm": 0.18004967601954425,
|
|
"learning_rate": 1.9896460041349366e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 11528509.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.2858891487121582,
|
|
"rewards/drgrpo_math_reward/mean": 0.5,
|
|
"rewards/drgrpo_math_reward/std": 0.5019646286964417,
|
|
"step": 129
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907149309207469e-10,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.3727598566308244,
|
|
"grad_norm": 0.1020692604987898,
|
|
"learning_rate": 1.989483876247107e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 11621139.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 130
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.3756272401433692,
|
|
"grad_norm": 0.07865944799990532,
|
|
"learning_rate": 1.9893204955712522e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 11694715.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 131
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.37849462365591396,
|
|
"grad_norm": 0.06802770153495377,
|
|
"learning_rate": 1.98915586231423e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 11770514.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 132
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.3813620071684588,
|
|
"grad_norm": 0.08913216962341508,
|
|
"learning_rate": 1.9889899766844814e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 11859752.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 133
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.691872442631884e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.3842293906810036,
|
|
"grad_norm": 0.07666960009104674,
|
|
"learning_rate": 1.9888228388920358e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 11948859.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 134
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.3870967741935484,
|
|
"grad_norm": 0.06749265175035522,
|
|
"learning_rate": 1.9886544491485064e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 12030381.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 135
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.661277174949646,
|
|
"advantages/var": 0.4372875021093847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.3899641577060932,
|
|
"grad_norm": 0.09757012597819024,
|
|
"learning_rate": 1.988484807667092e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 12129427.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.2120065838098526,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 136
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.2470284853558477e-08,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.392831541218638,
|
|
"grad_norm": 0.12809089614582359,
|
|
"learning_rate": 1.9883139146625762e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 12201685.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 137
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675973355770111,
|
|
"advantages/var": 0.21864726823871994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.3956989247311828,
|
|
"grad_norm": 0.0828590398375662,
|
|
"learning_rate": 1.988141770351326e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 12283527.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.11100947856903076,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 138
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344599133202926e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.3985663082437276,
|
|
"grad_norm": 0.12077351927878063,
|
|
"learning_rate": 1.987968374951296e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 12363347.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 139
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899627360122966e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 0.4014336917562724,
|
|
"grad_norm": 0.08131195331107871,
|
|
"learning_rate": 1.9877937286820203e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 12451379.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 140
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599658819865184e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.4043010752688172,
|
|
"grad_norm": 0.10033367759342848,
|
|
"learning_rate": 1.9876178317646203e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 12522473.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 141
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.4394010705092733e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.407168458781362,
|
|
"grad_norm": 0.1899150573095275,
|
|
"learning_rate": 1.9874406844217987e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 12620698.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 142
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8994882297977766e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.4100358422939068,
|
|
"grad_norm": 0.10722039505936129,
|
|
"learning_rate": 1.9872622868778427e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 12707291.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 143
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721560650546284e-09,
|
|
"advantages/std": 0.5227927565574646,
|
|
"advantages/var": 0.27331226630895245,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.4129032258064516,
|
|
"grad_norm": 0.18915905416870124,
|
|
"learning_rate": 1.987082639358622e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 12794653.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 144
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516881500847056e-09,
|
|
"advantages/std": 0.6185612082481384,
|
|
"advantages/var": 0.3826179683493969,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.4157706093189964,
|
|
"grad_norm": 0.10301029280811545,
|
|
"learning_rate": 1.9869017420915886e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 12891348.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.1751839816570282,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 145
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.0570528935649014e-08,
|
|
"advantages/std": 0.5726862549781799,
|
|
"advantages/var": 0.3279695466409329,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.41863799283154124,
|
|
"grad_norm": 0.10928504117762153,
|
|
"learning_rate": 1.9867195953057764e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 12968944.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.16439256072044373,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 146
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.6262611176060706e-09,
|
|
"advantages/std": 0.5726771354675293,
|
|
"advantages/var": 0.3279591014872949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.421505376344086,
|
|
"grad_norm": 0.14184961527585777,
|
|
"learning_rate": 1.986536199231803e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 13064416.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.1530819833278656,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 147
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629072505384383e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.65625,
|
|
"epoch": 0.4243727598566308,
|
|
"grad_norm": 0.07271219756156644,
|
|
"learning_rate": 1.986351554101866e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 13156550.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 148
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.991689037214316e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.42724014336917565,
|
|
"grad_norm": 0.14713350490945606,
|
|
"learning_rate": 1.986165660149745e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 13234866.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 149
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814038649715254e-09,
|
|
"advantages/std": 0.5228025913238525,
|
|
"advantages/var": 0.2733225494949352,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.43010752688172044,
|
|
"grad_norm": 0.1210857791775736,
|
|
"learning_rate": 1.985978517610801e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 13315749.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.14913517236709595,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 150
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.4329749103942652,
|
|
"grad_norm": 0.09906878327713253,
|
|
"learning_rate": 1.9857901267219754e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 13407117.0,
|
|
"reward": 0.5546875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.5546875,
|
|
"rewards/drgrpo_math_reward/std": 0.4989531338214874,
|
|
"step": 151
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.235004516445228e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.43584229390681006,
|
|
"grad_norm": 0.06565110128546628,
|
|
"learning_rate": 1.9856004877217905e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 13490088.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 152
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185716986656189,
|
|
"advantages/var": 0.3826309463900692,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.43870967741935485,
|
|
"grad_norm": 0.19932970995255095,
|
|
"learning_rate": 1.9854096008503493e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 13581609.0,
|
|
"reward": 0.578125,
|
|
"reward_std": 0.18990948796272278,
|
|
"rewards/drgrpo_math_reward/mean": 0.578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4957992732524872,
|
|
"step": 153
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.344599133202926e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.44157706093189963,
|
|
"grad_norm": 0.11456953029288426,
|
|
"learning_rate": 1.9852174663493334e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 13658704.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 154
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 9.199247230235001e-09,
|
|
"advantages/std": 0.40495598316192627,
|
|
"advantages/var": 0.1639893482986423,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 0.07376191063082546,
|
|
"learning_rate": 1.9850240844620046e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 13735606.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 155
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975300560494157e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.609375,
|
|
"epoch": 0.44731182795698926,
|
|
"grad_norm": 0.09368775004559605,
|
|
"learning_rate": 1.9848294554332047e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 13825703.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 156
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.45017921146953405,
|
|
"grad_norm": 0.10344565808718491,
|
|
"learning_rate": 1.9846335795093543e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 13901866.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 157
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049627482891083,
|
|
"advantages/var": 0.16399482750186767,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.45304659498207883,
|
|
"grad_norm": 0.06445645006262131,
|
|
"learning_rate": 1.9844364569384516e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 13989135.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 158
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.4559139784946237,
|
|
"grad_norm": 0.061078732138918075,
|
|
"learning_rate": 1.984238087970075e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 14079134.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 159
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.252482966806137e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.45878136200716846,
|
|
"grad_norm": 0.10308714282902937,
|
|
"learning_rate": 1.9840384728553785e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 14176262.0,
|
|
"reward": 0.546875,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.546875,
|
|
"rewards/drgrpo_math_reward/std": 0.4997538626194,
|
|
"step": 160
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.03370380844539e-09,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.46164874551971324,
|
|
"grad_norm": 0.14498186715974062,
|
|
"learning_rate": 1.983837611847096e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 14270346.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1820138692855835,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 161
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 3.764028020123724e-09,
|
|
"advantages/std": 0.6185677647590637,
|
|
"advantages/var": 0.3826260795990244,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.4645161290322581,
|
|
"grad_norm": 0.19613298266036122,
|
|
"learning_rate": 1.9836355051995393e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 14359158.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.18648964166641235,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 162
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.991766726549734e-09,
|
|
"advantages/std": 0.4675861597061157,
|
|
"advantages/var": 0.21863681674871316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.46738351254480287,
|
|
"grad_norm": 0.10450024700019477,
|
|
"learning_rate": 1.9834321531685943e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 14444276.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 163
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979230209351863e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.47025089605734766,
|
|
"grad_norm": 0.06584065416507832,
|
|
"learning_rate": 1.9832275560117267e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 14533311.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 164
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.7393245100975037,
|
|
"advantages/var": 0.5466007312309138,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.671875,
|
|
"epoch": 0.4731182795698925,
|
|
"grad_norm": 0.21015750556078078,
|
|
"learning_rate": 1.9830217139879765e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 14634245.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.25620073080062866,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 165
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.4948747045412387e-08,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.4759856630824373,
|
|
"grad_norm": 0.09577320769164782,
|
|
"learning_rate": 1.982814627357962e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 14715730.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 166
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.7014028429985046,
|
|
"advantages/var": 0.49196594816638495,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.47885304659498207,
|
|
"grad_norm": 0.15319625735144188,
|
|
"learning_rate": 1.982606296383875e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 14803933.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.25460803508758545,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 167
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.4817204301075269,
|
|
"grad_norm": 0.06886659174337954,
|
|
"learning_rate": 1.982396721329485e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 14895037.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 168
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967257841033182e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.4845878136200717,
|
|
"grad_norm": 0.08315863659595965,
|
|
"learning_rate": 1.9821859024601343e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 14977313.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 169
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.4874551971326165,
|
|
"grad_norm": 0.07170340697141077,
|
|
"learning_rate": 1.981973840042742e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 15053244.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 170
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917335935919526e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.49032258064516127,
|
|
"grad_norm": 0.0928601930313996,
|
|
"learning_rate": 1.9817605343458004e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 15127424.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 171
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.504965933612274e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 0.4931899641577061,
|
|
"grad_norm": 0.12236893897937705,
|
|
"learning_rate": 1.9815459856393767e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 15219173.0,
|
|
"reward": 0.578125,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4957992732524872,
|
|
"step": 172
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262736431211962e-09,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.4960573476702509,
|
|
"grad_norm": 0.17657832013900868,
|
|
"learning_rate": 1.9813301941951115e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 15299092.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1462520956993103,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 173
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.26953361251084e-09,
|
|
"advantages/std": 0.618580162525177,
|
|
"advantages/var": 0.3826414174696744,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.4989247311827957,
|
|
"grad_norm": 0.09171230585823344,
|
|
"learning_rate": 1.981113160286219e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 15390886.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.2012200653553009,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 174
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726932287216187,
|
|
"advantages/var": 0.3279775342235922,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.5017921146953405,
|
|
"grad_norm": 0.11084436188522191,
|
|
"learning_rate": 1.980894884187486e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 15475430.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.172288179397583,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 175
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907268126346096e-10,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.5046594982078854,
|
|
"grad_norm": 0.10981123292961002,
|
|
"learning_rate": 1.9806753661752724e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 15572291.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 176
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.1579757501173404e-08,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.5075268817204301,
|
|
"grad_norm": 0.13723730154725833,
|
|
"learning_rate": 1.980454606527511e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 15644451.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 177
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.65625,
|
|
"epoch": 0.5103942652329749,
|
|
"grad_norm": 0.08112611069058394,
|
|
"learning_rate": 1.980232605523706e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 15735323.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 178
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962549257704446e-09,
|
|
"advantages/std": 0.4676070809364319,
|
|
"advantages/var": 0.21865638214189076,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.5132616487455197,
|
|
"grad_norm": 0.09829344452240353,
|
|
"learning_rate": 1.9800093634449336e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 15830114.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.12019839137792587,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 179
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227997303009033,
|
|
"advantages/var": 0.27331955800269725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.5161290322580645,
|
|
"grad_norm": 0.1380835985674579,
|
|
"learning_rate": 1.9797848805738406e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 15917119.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.14806944131851196,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 180
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227986574172974,
|
|
"advantages/var": 0.27331843619732865,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.5189964157706093,
|
|
"grad_norm": 0.08296385514780048,
|
|
"learning_rate": 1.9795591571946452e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 16025022.0,
|
|
"reward": 0.4296875,
|
|
"reward_std": 0.14677615463733673,
|
|
"rewards/drgrpo_math_reward/mean": 0.4296875,
|
|
"rewards/drgrpo_math_reward/std": 0.4969765841960907,
|
|
"step": 181
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 7.317923683175049e-09,
|
|
"advantages/std": 0.5726968050003052,
|
|
"advantages/var": 0.32798163045755757,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.5218637992831541,
|
|
"grad_norm": 0.1014964446395621,
|
|
"learning_rate": 1.9793321935931374e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 16111335.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.17464719712734222,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 182
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721679467630723e-09,
|
|
"advantages/std": 0.5227904319763184,
|
|
"advantages/var": 0.27330983576598555,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.515625,
|
|
"epoch": 0.524731182795699,
|
|
"grad_norm": 0.07338541137425778,
|
|
"learning_rate": 1.979103990056675e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 16216919.0,
|
|
"reward": 0.515625,
|
|
"reward_std": 0.13781969249248505,
|
|
"rewards/drgrpo_math_reward/mean": 0.515625,
|
|
"rewards/drgrpo_math_reward/std": 0.5017194747924805,
|
|
"step": 183
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.5275985663082438,
|
|
"grad_norm": 0.09441859618951794,
|
|
"learning_rate": 1.9788745468741884e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 16304021.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 184
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983495876754114e-09,
|
|
"advantages/std": 0.4675905704498291,
|
|
"advantages/var": 0.2186409415735966,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.5304659498207885,
|
|
"grad_norm": 0.07701416297176929,
|
|
"learning_rate": 1.9786438643361754e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 16386297.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.10205792635679245,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 185
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.5333333333333333,
|
|
"grad_norm": 0.044088504975623034,
|
|
"learning_rate": 1.978411942734704e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 16464227.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 186
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.2675324905058625e-08,
|
|
"advantages/std": 0.661277174949646,
|
|
"advantages/var": 0.4372875021093847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.5362007168458781,
|
|
"grad_norm": 0.11815011962199863,
|
|
"learning_rate": 1.978178782363411e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 16553739.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.2120065838098526,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 187
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.5056222341621311e-09,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.5390681003584229,
|
|
"grad_norm": 0.10950900442633554,
|
|
"learning_rate": 1.9779443835175006e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 16641094.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.17859894037246704,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 188
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8994882297977766e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.640625,
|
|
"epoch": 0.5419354838709678,
|
|
"grad_norm": 0.06256870745435893,
|
|
"learning_rate": 1.977708746493746e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 16739866.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.4375,
|
|
"rewards/drgrpo_math_reward/std": 0.49802759289741516,
|
|
"step": 189
|
|
},
|
|
{
|
|
"advantages/mean": 5.587935447692871e-09,
|
|
"advantages/snr": 1.1950065676939262e-08,
|
|
"advantages/std": 0.4676070809364319,
|
|
"advantages/var": 0.21865638214189076,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.5448028673835126,
|
|
"grad_norm": 0.07119029462239967,
|
|
"learning_rate": 1.977471871590488e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 16835943.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.12019838392734528,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 190
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.493777631223398e-08,
|
|
"advantages/std": 0.4676010012626648,
|
|
"advantages/var": 0.21865069638184664,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.5476702508960574,
|
|
"grad_norm": 0.1244096994086529,
|
|
"learning_rate": 1.977233759107635e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 16909432.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 191
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.5505376344086022,
|
|
"grad_norm": 0.044683376952152215,
|
|
"learning_rate": 1.9769944093466608e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 16994670.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 192
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 0.5534050179211469,
|
|
"grad_norm": 0.12143333829380999,
|
|
"learning_rate": 1.9767538226106077e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 17083910.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.5625,
|
|
"rewards/drgrpo_math_reward/std": 0.49802759289741516,
|
|
"step": 193
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5055932184229277e-09,
|
|
"advantages/std": 0.6185751557350159,
|
|
"advantages/var": 0.38263522329259914,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.5562724014336917,
|
|
"grad_norm": 0.11438931886325104,
|
|
"learning_rate": 1.9765119992040825e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17176232.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.19567854702472687,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 194
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.781469264794068e-09,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.5591397849462365,
|
|
"grad_norm": 0.1271093212760164,
|
|
"learning_rate": 1.9762689394332583e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17262820.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 195
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.5620071684587814,
|
|
"grad_norm": 0.08724342257879564,
|
|
"learning_rate": 1.9760246436058746e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17341241.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 196
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629503101518235e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.5648745519713262,
|
|
"grad_norm": 0.10609677844903032,
|
|
"learning_rate": 1.975779112031234e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17424364.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 197
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5628117918993365e-09,
|
|
"advantages/std": 0.5228019952774048,
|
|
"advantages/var": 0.2733219262660356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.567741935483871,
|
|
"grad_norm": 0.12751734334843087,
|
|
"learning_rate": 1.9755323450202054e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 17509905.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.14807432889938354,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 198
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.5706093189964158,
|
|
"grad_norm": 0.08874879314195035,
|
|
"learning_rate": 1.9752843428852203e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17591545.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 199
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.5734767025089605,
|
|
"grad_norm": 0.07702245326996278,
|
|
"learning_rate": 1.9750351059402755e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17686347.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 200
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6261990006727604e-09,
|
|
"advantages/std": 0.5726990103721619,
|
|
"advantages/var": 0.32798415648125356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.5763440860215053,
|
|
"grad_norm": 0.18258259153790793,
|
|
"learning_rate": 1.9747846345009303e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17766586.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.17806214094161987,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 201
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.011145235853968e-09,
|
|
"advantages/std": 0.618583619594574,
|
|
"advantages/var": 0.3826456944307246,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.5792114695340502,
|
|
"grad_norm": 0.11440188062213137,
|
|
"learning_rate": 1.9745329288843074e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17858219.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.2069891095161438,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 202
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 6.7752778563340424e-09,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.582078853046595,
|
|
"grad_norm": 0.1044706273434688,
|
|
"learning_rate": 1.974279989409092e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 17945123.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1820138692855835,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 203
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.5849462365591398,
|
|
"grad_norm": 0.08307099624544009,
|
|
"learning_rate": 1.9740258163955306e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 18019296.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 204
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962814881146116e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.5878136200716846,
|
|
"grad_norm": 0.10386880549735236,
|
|
"learning_rate": 1.9737704101654332e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 18100689.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 205
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.5906810035842294,
|
|
"grad_norm": 0.10204646235825936,
|
|
"learning_rate": 1.9735137710421694e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 18185993.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 206
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.5935483870967742,
|
|
"grad_norm": 0.0425580612992601,
|
|
"learning_rate": 1.973255899350672e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 18265426.0,
|
|
"reward": 0.5390625,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.5390625,
|
|
"rewards/drgrpo_math_reward/std": 0.5004304051399231,
|
|
"step": 207
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.344384639658041e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.596415770609319,
|
|
"grad_norm": 0.12298704045791495,
|
|
"learning_rate": 1.9729967954174317e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 18354467.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 208
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.25248601345888e-09,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.5992831541218638,
|
|
"grad_norm": 0.14043999076034508,
|
|
"learning_rate": 1.972736459570501e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 18442227.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 209
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.6021505376344086,
|
|
"grad_norm": 0.08150097473653,
|
|
"learning_rate": 1.972474892139492e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 18530843.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 210
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 7.527905160729771e-10,
|
|
"advantages/std": 0.618580162525177,
|
|
"advantages/var": 0.3826414174696744,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.6050179211469534,
|
|
"grad_norm": 0.1087801091982331,
|
|
"learning_rate": 1.972212093455576e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 18616572.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.2012200653553009,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 211
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.6078853046594982,
|
|
"grad_norm": 0.04257051308921546,
|
|
"learning_rate": 1.9719480638514825e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 18691487.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 212
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599795920412534e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.610752688172043,
|
|
"grad_norm": 0.08885381295106154,
|
|
"learning_rate": 1.9716828036615002e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 18770764.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 213
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 8.449892780067669e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.6136200716845878,
|
|
"grad_norm": 0.07110255927977205,
|
|
"learning_rate": 1.9714163132214763e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 18848544.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 214
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.9792225930357895e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.6164874551971327,
|
|
"grad_norm": 0.1054824330345328,
|
|
"learning_rate": 1.9711485928688146e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 18926159.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11678344756364822,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 215
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.1579601710407989e-08,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.6193548387096774,
|
|
"grad_norm": 0.09028662906122077,
|
|
"learning_rate": 1.9708796429424763e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19020277.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 216
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878573645701288e-09,
|
|
"advantages/std": 0.5727017521858215,
|
|
"advantages/var": 0.32798729695671014,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.6222222222222222,
|
|
"grad_norm": 0.16890927041982728,
|
|
"learning_rate": 1.9706094637829794e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19096844.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.18253791332244873,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 217
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 7.3180584939108565e-09,
|
|
"advantages/std": 0.5726862549781799,
|
|
"advantages/var": 0.3279695466409329,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.625089605734767,
|
|
"grad_norm": 0.09698327360753146,
|
|
"learning_rate": 1.9703380557323994e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19184975.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.16439256072044373,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 218
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 5.311215035117412e-09,
|
|
"advantages/std": 0.7014007568359375,
|
|
"advantages/var": 0.4919630216900259,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.6279569892473118,
|
|
"grad_norm": 0.14413514727585514,
|
|
"learning_rate": 1.970065419134366e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19277171.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.25354230403900146,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 219
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917335935919526e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.6308243727598566,
|
|
"grad_norm": 0.11398432656810822,
|
|
"learning_rate": 1.969791554334065e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19355220.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 220
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.2470341725316569e-08,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.6336917562724015,
|
|
"grad_norm": 0.06968706164339275,
|
|
"learning_rate": 1.9695164616782378e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19442603.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 221
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.408356612639279e-09,
|
|
"advantages/std": 0.6612831950187683,
|
|
"advantages/var": 0.43729546401423036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.6365591397849463,
|
|
"grad_norm": 0.12417487379900514,
|
|
"learning_rate": 1.969240141515179e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19536443.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.21990221738815308,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 222
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629027821475993e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.6394265232974911,
|
|
"grad_norm": 0.11190128164642635,
|
|
"learning_rate": 1.9689625941947394e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 19613563.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 223
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 8.962814881146116e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.6422939068100358,
|
|
"grad_norm": 0.10184327264367528,
|
|
"learning_rate": 1.9686838200683217e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19682932.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 224
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.131062703721143e-10,
|
|
"advantages/std": 0.5726942420005798,
|
|
"advantages/var": 0.3279786948206187,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.6451612903225806,
|
|
"grad_norm": 0.13523562258217606,
|
|
"learning_rate": 1.9684038194888825e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 19774164.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.17358146607875824,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 225
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.6480286738351254,
|
|
"grad_norm": 0.08049877908916266,
|
|
"learning_rate": 1.9681225928109316e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 19854147.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 226
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958350617856365e-10,
|
|
"advantages/std": 0.46760883927345276,
|
|
"advantages/var": 0.21865802656666578,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.6508960573476702,
|
|
"grad_norm": 0.10117609949737459,
|
|
"learning_rate": 1.9678401403905304e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 19941487.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.11914245784282684,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 227
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.065478038084407e-09,
|
|
"advantages/std": 0.5727017521858215,
|
|
"advantages/var": 0.32798729695671014,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.6537634408602151,
|
|
"grad_norm": 0.13237339755186903,
|
|
"learning_rate": 1.967556462585293e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20033133.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.18253791332244873,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 228
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.6566308243727599,
|
|
"grad_norm": 0.08008024137443356,
|
|
"learning_rate": 1.967271559754384e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20108285.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 229
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.439373903985093e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.6594982078853047,
|
|
"grad_norm": 0.09014607484840405,
|
|
"learning_rate": 1.9669854322585205e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20192884.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 230
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131207417015344e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.6623655913978495,
|
|
"grad_norm": 0.12403951610663785,
|
|
"learning_rate": 1.9666980804599685e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20269154.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 231
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.6652329749103942,
|
|
"grad_norm": 0.06451461003988601,
|
|
"learning_rate": 1.966409504722545e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20346655.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 232
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899267866969296e-09,
|
|
"advantages/std": 0.4049658179283142,
|
|
"advantages/var": 0.16399731369034853,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.668100358422939,
|
|
"grad_norm": 0.07448062664269792,
|
|
"learning_rate": 1.9661197054116164e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20430353.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.094686359167099,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 233
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.02242626209624e-09,
|
|
"advantages/std": 0.6185696721076965,
|
|
"advantages/var": 0.3826284392514232,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.6709677419354839,
|
|
"grad_norm": 0.139481185743962,
|
|
"learning_rate": 1.9658286828940987e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 20522660.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.18649455904960632,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 234
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.6738351254480287,
|
|
"grad_norm": 0.089851647052127,
|
|
"learning_rate": 1.965536437538456e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20614712.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 235
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.6767025089605735,
|
|
"grad_norm": 0.02091972713635081,
|
|
"learning_rate": 1.9652429697147003e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20696075.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 236
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966768334962982e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.6795698924731183,
|
|
"grad_norm": 0.0821111362604408,
|
|
"learning_rate": 1.964948279794393e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20775211.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 237
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.6824372759856631,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.9646523681506414e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20844029.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 238
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.225206177997688e-09,
|
|
"advantages/std": 0.6612618565559387,
|
|
"advantages/var": 0.4372672429358069,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.6853046594982078,
|
|
"grad_norm": 0.11245554500128893,
|
|
"learning_rate": 1.9643552351580997e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 20926989.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.19044628739356995,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 239
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.90714930920747e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.6881720430107527,
|
|
"grad_norm": 0.09191890580990122,
|
|
"learning_rate": 1.964056881192969e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 21011593.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 240
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3443115207936544e-09,
|
|
"advantages/std": 0.5227928161621094,
|
|
"advantages/var": 0.2733123286307091,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.6910394265232975,
|
|
"grad_norm": 0.12594432298115876,
|
|
"learning_rate": 1.963757306632996e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 21098836.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 241
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 7.528014692854944e-10,
|
|
"advantages/std": 0.6185711622238159,
|
|
"advantages/var": 0.3826302827349224,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.6939068100358423,
|
|
"grad_norm": 0.15637871712059767,
|
|
"learning_rate": 1.963456511857472e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 21178936.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.18884867429733276,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 242
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.6967741935483871,
|
|
"grad_norm": 0.060148920256316504,
|
|
"learning_rate": 1.9631544972472355e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 21255305.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 243
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701738269761e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.6996415770609319,
|
|
"grad_norm": 0.08951489657922204,
|
|
"learning_rate": 1.962851263184667e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 21354786.0,
|
|
"reward": 0.4609375,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.4609375,
|
|
"rewards/drgrpo_math_reward/std": 0.5004304051399231,
|
|
"step": 244
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0688659600644362e-08,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.7025089605734767,
|
|
"grad_norm": 0.09867558191702931,
|
|
"learning_rate": 1.9625468100536918e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 21443982.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 245
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.7053763440860215,
|
|
"grad_norm": 0.07725741015662844,
|
|
"learning_rate": 1.9622411382397793e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 21529714.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 246
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721804379038284e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.7082437275985664,
|
|
"grad_norm": 0.1149381843048165,
|
|
"learning_rate": 1.9619342481299407e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 21626721.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 247
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.563018557708836e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 0.11302622859074207,
|
|
"learning_rate": 1.9616261401127316e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 21713279.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 248
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.057046292582203e-08,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 0.7139784946236559,
|
|
"grad_norm": 0.12328874251473232,
|
|
"learning_rate": 1.9613168145782465e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 21797331.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.16675156354904175,
|
|
"rewards/drgrpo_math_reward/mean": 0.59375,
|
|
"rewards/drgrpo_math_reward/std": 0.4930621087551117,
|
|
"step": 249
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.041958882220529e-09,
|
|
"advantages/std": 0.6612666845321655,
|
|
"advantages/var": 0.4372736280721625,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.7168458781362007,
|
|
"grad_norm": 0.11302415709130216,
|
|
"learning_rate": 1.9610062719181248e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 21883873.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.19939783215522766,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 250
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.7197132616487455,
|
|
"grad_norm": 0.08517678527092525,
|
|
"learning_rate": 1.9606945125255447e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 21962730.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 251
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383618110974619e-08,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.7225806451612903,
|
|
"grad_norm": 0.0913818639821884,
|
|
"learning_rate": 1.9603815367952253e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 22054023.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 252
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907505770133387e-10,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.7254480286738352,
|
|
"grad_norm": 0.10096835309765015,
|
|
"learning_rate": 1.9600673451234268e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 22125045.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 253
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.701427161693573,
|
|
"advantages/var": 0.4920000631615018,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.53125,
|
|
"epoch": 0.72831541218638,
|
|
"grad_norm": 0.13821949511592338,
|
|
"learning_rate": 1.9597519379079476e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 22236387.0,
|
|
"reward": 0.5234375,
|
|
"reward_std": 0.29536473751068115,
|
|
"rewards/drgrpo_math_reward/mean": 0.5234375,
|
|
"rewards/drgrpo_math_reward/std": 0.5014128684997559,
|
|
"step": 254
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.966686079472842e-09,
|
|
"advantages/std": 0.4676085114479065,
|
|
"advantages/var": 0.2186577199785269,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.7311827956989247,
|
|
"grad_norm": 0.08267618108141793,
|
|
"learning_rate": 1.959435315548125e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 22315025.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.12232004851102829,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 255
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.7340501792114695,
|
|
"grad_norm": 0.10328968332091389,
|
|
"learning_rate": 1.959117478444836e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 22394444.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 256
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 7.317958718255826e-09,
|
|
"advantages/std": 0.5726940631866455,
|
|
"advantages/var": 0.3279784900092295,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.7369175627240143,
|
|
"grad_norm": 0.12066286064952439,
|
|
"learning_rate": 1.9587984270004948e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 22477849.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.17017142474651337,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 257
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.7397849462365591,
|
|
"grad_norm": 0.13435211185718615,
|
|
"learning_rate": 1.9584781616190534e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 22554506.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 258
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.7426523297491039,
|
|
"grad_norm": 0.06223946926258825,
|
|
"learning_rate": 1.9581566827060006e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 22621556.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 259
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958459784009791e-09,
|
|
"advantages/std": 0.4676037132740021,
|
|
"advantages/var": 0.21865323266763514,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.7455197132616488,
|
|
"grad_norm": 0.09677817546646107,
|
|
"learning_rate": 1.9578339906683615e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 22707125.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 260
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016434378286722e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.7483870967741936,
|
|
"grad_norm": 0.10330998272932007,
|
|
"learning_rate": 1.9575100859146973e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 22787207.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 261
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536934733273465e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.7512544802867384,
|
|
"grad_norm": 0.11445295703033564,
|
|
"learning_rate": 1.9571849688551045e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 22879810.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 262
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.7541218637992831,
|
|
"grad_norm": 0.1431856830147952,
|
|
"learning_rate": 1.956858639901215e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 22958989.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 263
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 3.0026272842061405e-09,
|
|
"advantages/std": 0.7754230499267578,
|
|
"advantages/var": 0.6012809063577151,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.7569892473118279,
|
|
"grad_norm": 0.17368791123183652,
|
|
"learning_rate": 1.9565310994661943e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 23057071.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.30115634202957153,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 264
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.975176026781512e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.7598566308243727,
|
|
"grad_norm": 0.07708908665544316,
|
|
"learning_rate": 1.9562023479647423e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 23130250.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 265
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.906938086565493e-10,
|
|
"advantages/std": 0.5228073596954346,
|
|
"advantages/var": 0.2733275353517115,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.7627240143369176,
|
|
"grad_norm": 0.12603516044479746,
|
|
"learning_rate": 1.955872385813092e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 23221313.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.15596505999565125,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 266
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.7655913978494624,
|
|
"grad_norm": 0.057732338064037186,
|
|
"learning_rate": 1.95554121342901e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 23303763.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 267
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0689055673126508e-08,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.7684587813620072,
|
|
"grad_norm": 0.09102252601234391,
|
|
"learning_rate": 1.955208831231794e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 23388152.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 268
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.065623173308489e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.771326164874552,
|
|
"grad_norm": 0.10980681592679174,
|
|
"learning_rate": 1.9548752396422735e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 23475335.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 269
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185775995254517,
|
|
"advantages/var": 0.38263824663467005,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.7741935483870968,
|
|
"grad_norm": 0.12735926909571454,
|
|
"learning_rate": 1.9545404390828105e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 23572561.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.19674429297447205,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 270
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0688419534073913e-08,
|
|
"advantages/std": 0.5228027701377869,
|
|
"advantages/var": 0.2733227364637436,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 0.7770609318996415,
|
|
"grad_norm": 0.09015350223423653,
|
|
"learning_rate": 1.9542044299772958e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 23667633.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.15254521369934082,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 271
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.7799283154121864,
|
|
"grad_norm": 0.070569427294079,
|
|
"learning_rate": 1.9538672127511523e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 23743563.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 272
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.974933067692274e-09,
|
|
"advantages/std": 0.46761488914489746,
|
|
"advantages/var": 0.21866368454999474,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.7827956989247312,
|
|
"grad_norm": 0.12482820251144082,
|
|
"learning_rate": 1.9535287878313314e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 23811169.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.12703317403793335,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 273
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.757495615940373e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.785663082437276,
|
|
"grad_norm": 0.11090556503992859,
|
|
"learning_rate": 1.953189155646313e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 23895942.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 274
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 9.786451163004381e-09,
|
|
"advantages/std": 0.6185691356658936,
|
|
"advantages/var": 0.3826277755984506,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.7885304659498208,
|
|
"grad_norm": 0.13384615172927988,
|
|
"learning_rate": 1.952848316626108e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 23994424.0,
|
|
"reward": 0.578125,
|
|
"reward_std": 0.18543371558189392,
|
|
"rewards/drgrpo_math_reward/mean": 0.578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4957992732524872,
|
|
"step": 275
|
|
},
|
|
{
|
|
"advantages/mean": -3.4924596548080444e-09,
|
|
"advantages/snr": 4.723789219088827e-09,
|
|
"advantages/std": 0.7393343448638916,
|
|
"advantages/var": 0.5466152734953198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.7913978494623656,
|
|
"grad_norm": 0.16870213729650185,
|
|
"learning_rate": 1.9525062712022515e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 24090030.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.2698703408241272,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 276
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878797063112294e-09,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 0.7942652329749104,
|
|
"grad_norm": 0.11898481530527014,
|
|
"learning_rate": 1.952163019807809e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 24164254.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.15072788298130035,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 277
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 7.746009544294893e-09,
|
|
"advantages/std": 0.6612790822982788,
|
|
"advantages/var": 0.4372900246852538,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.7971326164874551,
|
|
"grad_norm": 0.13360700941196924,
|
|
"learning_rate": 1.951818562877372e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 24261394.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.21542152762413025,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 278
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.2596376216935218e-09,
|
|
"advantages/std": 0.7393575310707092,
|
|
"advantages/var": 0.5466495587509748,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.8,
|
|
"grad_norm": 0.1580545242588768,
|
|
"learning_rate": 1.951472900847058e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 24349607.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.30744943022727966,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 279
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.4635969227931852e-08,
|
|
"advantages/std": 0.5726920366287231,
|
|
"advantages/var": 0.32797616881795477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.8028673835125448,
|
|
"grad_norm": 0.10266306757635575,
|
|
"learning_rate": 1.9511260341545107e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 24433195.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1701665222644806,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 280
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.0111426245614905e-09,
|
|
"advantages/std": 0.618584156036377,
|
|
"advantages/var": 0.38264635809923675,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.8057347670250896,
|
|
"grad_norm": 0.13785243356693017,
|
|
"learning_rate": 1.9507779632388995e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 24512800.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.208049938082695,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 281
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562971027883829e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.8086021505376344,
|
|
"grad_norm": 0.10594585005660433,
|
|
"learning_rate": 1.950428688540917e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 24604862.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 282
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.8114695340501792,
|
|
"grad_norm": 0.05473787045146346,
|
|
"learning_rate": 1.9500782105027807e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 24695761.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 283
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 8.131246346616979e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.814336917562724,
|
|
"grad_norm": 0.18183527260060542,
|
|
"learning_rate": 1.9497265295682326e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 24780624.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 284
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504997077293582e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.8172043010752689,
|
|
"grad_norm": 0.11798162586925885,
|
|
"learning_rate": 1.9493736461825363e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 24864352.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 285
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.609375,
|
|
"epoch": 0.8200716845878137,
|
|
"grad_norm": 0.03640262071234597,
|
|
"learning_rate": 1.9490195607924782e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 24959916.0,
|
|
"reward": 0.578125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4957992732524872,
|
|
"step": 286
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.8229390681003584,
|
|
"grad_norm": 0.06500180531661805,
|
|
"learning_rate": 1.948664273846367e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 25045434.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 287
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.0224889366485245e-09,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.8258064516129032,
|
|
"grad_norm": 0.12945246604514526,
|
|
"learning_rate": 1.9483077857940326e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 25134360.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.17859894037246704,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 288
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958683201273463e-10,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.828673835125448,
|
|
"grad_norm": 0.07853132739771744,
|
|
"learning_rate": 1.9479500970868246e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 25214310.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 289
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.8315412186379928,
|
|
"grad_norm": 0.09141425024311402,
|
|
"learning_rate": 1.9475912081776144e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 25302858.0,
|
|
"reward": 0.5234375,
|
|
"reward_std": 0.08443662524223328,
|
|
"rewards/drgrpo_math_reward/mean": 0.5234375,
|
|
"rewards/drgrpo_math_reward/std": 0.5014128684997559,
|
|
"step": 290
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499147049662961e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.8344086021505376,
|
|
"grad_norm": 0.09590480521856211,
|
|
"learning_rate": 1.9472311195207915e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 25386803.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 291
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9916330589501627e-09,
|
|
"advantages/std": 0.46761754155158997,
|
|
"advantages/var": 0.21866616516675297,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 0.8372759856630825,
|
|
"grad_norm": 0.11322544781915576,
|
|
"learning_rate": 1.9468698315722655e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 25468775.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.130448117852211,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 292
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.453692965541534e-09,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.8401433691756273,
|
|
"grad_norm": 0.08298753538962973,
|
|
"learning_rate": 1.946507344789464e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 25552470.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 293
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.5056146899623504e-09,
|
|
"advantages/std": 0.6185663342475891,
|
|
"advantages/var": 0.38262430986450013,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.843010752688172,
|
|
"grad_norm": 0.12742568170963892,
|
|
"learning_rate": 1.9461436596313317e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 25640616.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.1841355264186859,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 294
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721649001610904e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.8458781362007168,
|
|
"grad_norm": 0.13611944421932107,
|
|
"learning_rate": 1.9457787765583325e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 25720686.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 295
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125686138545943e-09,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.8487455197132616,
|
|
"grad_norm": 0.0888802118140364,
|
|
"learning_rate": 1.945412696032445e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 25809142.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 296
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.033674212111923e-09,
|
|
"advantages/std": 0.6185672879219055,
|
|
"advantages/var": 0.38262548968706156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.8516129032258064,
|
|
"grad_norm": 0.11517617807560514,
|
|
"learning_rate": 1.9450454185171647e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 25886208.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.18542881309986115,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 297
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.8544802867383513,
|
|
"grad_norm": 0.09118794291826457,
|
|
"learning_rate": 1.944676944477503e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 25962561.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 298
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814432667740602e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.8573476702508961,
|
|
"grad_norm": 0.11843588600056001,
|
|
"learning_rate": 1.944307274379985e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 26033077.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 299
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.9792508369915716e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 0.8602150537634409,
|
|
"grad_norm": 0.06641757839953512,
|
|
"learning_rate": 1.943936408692652e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 26124611.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.11336850374937057,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 300
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917699002625455e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.8630824372759857,
|
|
"grad_norm": 0.088638599805281,
|
|
"learning_rate": 1.9435643478850574e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 26213659.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 301
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536934733273465e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.8659498207885304,
|
|
"grad_norm": 0.11418468041650211,
|
|
"learning_rate": 1.9431910924282677e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 26303162.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 302
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983349132682101e-09,
|
|
"advantages/std": 0.4676077961921692,
|
|
"advantages/var": 0.21865705105969724,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.8688172043010752,
|
|
"grad_norm": 0.08943390980926356,
|
|
"learning_rate": 1.942816642794864e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 26395398.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.12125921249389648,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 303
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.95862671130252e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.8716845878136201,
|
|
"grad_norm": 0.10227841262538512,
|
|
"learning_rate": 1.942440999458937e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 26482466.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.10888781398534775,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 304
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.8745519713261649,
|
|
"grad_norm": 0.06312615865790253,
|
|
"learning_rate": 1.9420641628960895e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 26545949.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 305
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875335558214736e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.8774193548387097,
|
|
"grad_norm": 0.14769399231288133,
|
|
"learning_rate": 1.9416861335834354e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 26635345.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 306
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 2.8167388677908054e-09,
|
|
"advantages/std": 0.661277174949646,
|
|
"advantages/var": 0.4372875021093847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.8802867383512545,
|
|
"grad_norm": 0.1719249628407586,
|
|
"learning_rate": 1.9413069119995994e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 26724453.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.2120065838098526,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 307
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.8831541218637993,
|
|
"grad_norm": 0.051521206027291225,
|
|
"learning_rate": 1.9409264986247136e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 26810148.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 308
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.130935766475913e-10,
|
|
"advantages/std": 0.5727031826972961,
|
|
"advantages/var": 0.32798893547161256,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 0.886021505376344,
|
|
"grad_norm": 0.1586860930519003,
|
|
"learning_rate": 1.9405448939404213e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 26887753.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1814819872379303,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 309
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5056415302478586e-09,
|
|
"advantages/std": 0.6185553073883057,
|
|
"advantages/var": 0.3826106682982413,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 0.11676973725591522,
|
|
"learning_rate": 1.9401620984298726e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 26975280.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.16834920644760132,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 310
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.3361080419982039e-08,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.8917562724014337,
|
|
"grad_norm": 0.10791082633442746,
|
|
"learning_rate": 1.9397781125777263e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 27063327.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 311
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5628865335481204e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.8946236559139785,
|
|
"grad_norm": 0.11106150865967176,
|
|
"learning_rate": 1.9393929368701474e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 27142933.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 312
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383310073066278e-08,
|
|
"advantages/std": 0.5727031826972961,
|
|
"advantages/var": 0.32798893547161256,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.8974910394265233,
|
|
"grad_norm": 0.1308299915232991,
|
|
"learning_rate": 1.939006571794808e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 27229159.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.1814819872379303,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 313
|
|
},
|
|
{
|
|
"advantages/mean": -1.0710209608078003e-08,
|
|
"advantages/snr": 1.870084863867376e-08,
|
|
"advantages/std": 0.5727124810218811,
|
|
"advantages/var": 0.3279995859182385,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.9003584229390681,
|
|
"grad_norm": 0.12675411347923377,
|
|
"learning_rate": 1.9386190178408863e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 27321247.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.1962025910615921,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 314
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 5.269560016029535e-09,
|
|
"advantages/std": 0.6185770630836487,
|
|
"advantages/var": 0.3826375829731923,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.9032258064516129,
|
|
"grad_norm": 0.11983649469404985,
|
|
"learning_rate": 1.9382302754990644e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 27410537.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.19568344950675964,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 315
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878505609008981e-09,
|
|
"advantages/std": 0.5727097392082214,
|
|
"advantages/var": 0.327996445383949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.671875,
|
|
"epoch": 0.9060931899641577,
|
|
"grad_norm": 0.10341377968316127,
|
|
"learning_rate": 1.9378403452615308e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 27506250.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.19172681868076324,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 316
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344431558649841e-09,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.9089605734767026,
|
|
"grad_norm": 0.09507098139210479,
|
|
"learning_rate": 1.937449227621977e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 27581956.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 317
|
|
},
|
|
{
|
|
"advantages/mean": 5.587935447692871e-09,
|
|
"advantages/snr": 1.1950152502444473e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 0.9118279569892473,
|
|
"grad_norm": 0.07694112355680065,
|
|
"learning_rate": 1.937056923075598e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 27675568.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 318
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.25248601345888e-09,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.9146953405017921,
|
|
"grad_norm": 0.1045675617841734,
|
|
"learning_rate": 1.936663432119091e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 27755049.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 319
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.9175627240143369,
|
|
"grad_norm": 0.08094397471372518,
|
|
"learning_rate": 1.936268755250657e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 27833337.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 320
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.068867300569461e-08,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.9204301075268817,
|
|
"grad_norm": 0.08290054487635672,
|
|
"learning_rate": 1.935872892969996e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 27913335.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 321
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.6262414834030685e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.9232974910394265,
|
|
"grad_norm": 0.14424633415032476,
|
|
"learning_rate": 1.9354758457783118e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 28000444.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 322
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.0570411218700057e-08,
|
|
"advantages/std": 0.5726926326751709,
|
|
"advantages/var": 0.3279768515204182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.9261648745519713,
|
|
"grad_norm": 0.10262016914447541,
|
|
"learning_rate": 1.9350776141783053e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 28090617.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.1712273508310318,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 323
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.9290322580645162,
|
|
"grad_norm": 0.06264814038661047,
|
|
"learning_rate": 1.9346781986741796e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 28167398.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 324
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 7.528182986834277e-09,
|
|
"advantages/std": 0.618557333946228,
|
|
"advantages/var": 0.38261317537866546,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.931899641577061,
|
|
"grad_norm": 0.09011173223308504,
|
|
"learning_rate": 1.9342775997716356e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 28261498.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.17176413536071777,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 325
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.9347670250896057,
|
|
"grad_norm": 0.07463006151846577,
|
|
"learning_rate": 1.933875817977872e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 28333078.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 326
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983533706996105e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.9376344086021505,
|
|
"grad_norm": 0.0995946719653618,
|
|
"learning_rate": 1.9334728538015857e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 28408055.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 327
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 0.9405017921146953,
|
|
"grad_norm": 0.04232695230937809,
|
|
"learning_rate": 1.933068707752972e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 28484536.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 328
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.01628083497369e-09,
|
|
"advantages/std": 0.5228049755096436,
|
|
"advantages/var": 0.273325042417639,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 0.9433691756272401,
|
|
"grad_norm": 0.12948393095836913,
|
|
"learning_rate": 1.9326633803437195e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 28567115.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1525501012802124,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 329
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.974924309348006e-09,
|
|
"advantages/std": 0.4676155745983124,
|
|
"advantages/var": 0.21866432560690985,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.946236559139785,
|
|
"grad_norm": 0.11737178296005357,
|
|
"learning_rate": 1.932256872087015e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 28646097.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.12809401750564575,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 330
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907227504745508e-10,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.9491039426523298,
|
|
"grad_norm": 0.09571629711822983,
|
|
"learning_rate": 1.9318491834975396e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 28731611.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 331
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.9519713261648746,
|
|
"grad_norm": 0.13642467920291784,
|
|
"learning_rate": 1.931440315091469e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 28800915.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 332
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.9548387096774194,
|
|
"grad_norm": 0.053445425827039775,
|
|
"learning_rate": 1.9310302673864724e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 28879905.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 333
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.4635862598936842e-08,
|
|
"advantages/std": 0.5726962089538574,
|
|
"advantages/var": 0.3279809477501203,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.9577060931899641,
|
|
"grad_norm": 0.11806438466355382,
|
|
"learning_rate": 1.930619040901712e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 28957308.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.17358636856079102,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 334
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 0.9605734767025089,
|
|
"grad_norm": 0.08505699495761096,
|
|
"learning_rate": 1.930206636157843e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 29038618.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 335
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 3.764028020123724e-09,
|
|
"advantages/std": 0.6185677647590637,
|
|
"advantages/var": 0.3826260795990244,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 0.9634408602150538,
|
|
"grad_norm": 0.14199404018098058,
|
|
"learning_rate": 1.929793053677012e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 29126806.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.18648964166641235,
|
|
"rewards/drgrpo_math_reward/mean": 0.59375,
|
|
"rewards/drgrpo_math_reward/std": 0.4930621087551117,
|
|
"step": 336
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.234939828648251e-09,
|
|
"advantages/std": 0.5228003859519958,
|
|
"advantages/var": 0.2733202435515558,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 0.9663082437275986,
|
|
"grad_norm": 0.12599572121905642,
|
|
"learning_rate": 1.929378293982857e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 29212479.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.14913025498390198,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 337
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.2525831708228704e-09,
|
|
"advantages/std": 0.5726664066314697,
|
|
"advantages/var": 0.32794681328419983,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 0.9691756272401434,
|
|
"grad_norm": 0.111670356484796,
|
|
"learning_rate": 1.928962357600506e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 29298954.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.13941730558872223,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 338
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 0.9720430107526882,
|
|
"grad_norm": 0.16320620762576749,
|
|
"learning_rate": 1.928545245056577e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 29376003.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.1173202320933342,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 339
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 0.974910394265233,
|
|
"grad_norm": 0.08899428414824657,
|
|
"learning_rate": 1.9281269568791776e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 29464959.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 340
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.9777777777777777,
|
|
"grad_norm": 0.07683186211082983,
|
|
"learning_rate": 1.9277074935979034e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 29540813.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 341
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199522104181912e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.9806451612903225,
|
|
"grad_norm": 0.11069940579873279,
|
|
"learning_rate": 1.9272868557438377e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 29613965.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 342
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 0.9835125448028674,
|
|
"grad_norm": 0.07745781028622259,
|
|
"learning_rate": 1.9268650438495512e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 29691372.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 343
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.9708058921609245e-09,
|
|
"advantages/std": 0.46761149168014526,
|
|
"advantages/var": 0.21866050715133056,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 0.9863799283154122,
|
|
"grad_norm": 0.09177326008077252,
|
|
"learning_rate": 1.9264420584491013e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 29767997.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.12255740165710449,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 344
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 9.797750300383811e-09,
|
|
"advantages/std": 0.5228010416030884,
|
|
"advantages/var": 0.27332092910127415,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 0.989247311827957,
|
|
"grad_norm": 0.14473041714484297,
|
|
"learning_rate": 1.9260179000780308e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 29870801.0,
|
|
"reward": 0.484375,
|
|
"reward_std": 0.15019109845161438,
|
|
"rewards/drgrpo_math_reward/mean": 0.484375,
|
|
"rewards/drgrpo_math_reward/std": 0.5017194747924805,
|
|
"step": 345
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.053926931635423e-08,
|
|
"advantages/std": 0.6185683012008667,
|
|
"advantages/var": 0.38262674325052615,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 0.9921146953405018,
|
|
"grad_norm": 0.1201267373241665,
|
|
"learning_rate": 1.9255925692733675e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 29962857.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.18755048513412476,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 346
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.3361080419982039e-08,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 0.9949820788530466,
|
|
"grad_norm": 0.10345892575163858,
|
|
"learning_rate": 1.925166066573624e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 30042140.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 347
|
|
},
|
|
{
|
|
"advantages/mean": 7.450580596923828e-09,
|
|
"advantages/snr": 1.4251481140387377e-08,
|
|
"advantages/std": 0.5227934122085571,
|
|
"advantages/var": 0.27331295184866633,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 0.9978494623655914,
|
|
"grad_norm": 0.09853084355490387,
|
|
"learning_rate": 1.9247383925187957e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 30136592.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.1422954797744751,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 348
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.987550502194943e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.0028673835125448,
|
|
"grad_norm": 0.09222286368221157,
|
|
"learning_rate": 1.924309547650363e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 30226361.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11336850374937057,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 349
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.0057347670250896,
|
|
"grad_norm": 0.09281704064489878,
|
|
"learning_rate": 1.9238795325112867e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 30301423.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 350
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.0086021505376344,
|
|
"grad_norm": 0.11214316964016033,
|
|
"learning_rate": 1.9234483476460102e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 30378627.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 351
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 6.639167740785514e-09,
|
|
"advantages/std": 0.7013850212097168,
|
|
"advantages/var": 0.4919409479773549,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.0114695340501791,
|
|
"grad_norm": 0.14615136733815426,
|
|
"learning_rate": 1.9230159936004578e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 30465853.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.22962790727615356,
|
|
"rewards/drgrpo_math_reward/mean": 0.59375,
|
|
"rewards/drgrpo_math_reward/std": 0.4930621087551117,
|
|
"step": 352
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 1.014336917562724,
|
|
"grad_norm": 0.09620376443450143,
|
|
"learning_rate": 1.922582470922034e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 30555585.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.53125,
|
|
"rewards/drgrpo_math_reward/std": 0.5009832978248596,
|
|
"step": 353
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.9750579720916185e-09,
|
|
"advantages/std": 0.4676051139831543,
|
|
"advantages/var": 0.21865454262319872,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.0172043010752687,
|
|
"grad_norm": 0.09875036348818256,
|
|
"learning_rate": 1.922147780159623e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 30636497.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.11784426867961884,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 354
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.0200716845878137,
|
|
"grad_norm": 0.10569425566424312,
|
|
"learning_rate": 1.921711921863588e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 30721054.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 355
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.0570579543741361e-08,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.0229390681003585,
|
|
"grad_norm": 0.09538101147812049,
|
|
"learning_rate": 1.9212748965857696e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 30809381.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 356
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.344354173221399e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.0258064516129033,
|
|
"grad_norm": 0.13437854088249473,
|
|
"learning_rate": 1.9208367048794875e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 30894319.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 357
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.065577896776649e-09,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 1.028673835125448,
|
|
"grad_norm": 0.10619439403681721,
|
|
"learning_rate": 1.9203973472995368e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 30985248.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 358
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 8.049402934764073e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.0315412186379929,
|
|
"grad_norm": 0.0571904181498558,
|
|
"learning_rate": 1.9199568244021894e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31060470.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 359
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967079601050182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.0344086021505376,
|
|
"grad_norm": 0.07861517559921144,
|
|
"learning_rate": 1.9195151367451928e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31140229.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 360
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.0372759856630824,
|
|
"grad_norm": 0.06784142929071403,
|
|
"learning_rate": 1.9190722848877683e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31225829.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 361
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.252446745927492e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.0401433691756272,
|
|
"grad_norm": 0.10398935060390492,
|
|
"learning_rate": 1.9186282693906115e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 31309772.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 362
|
|
},
|
|
{
|
|
"advantages/mean": 7.916241884231567e-09,
|
|
"advantages/snr": 1.5141977736956797e-08,
|
|
"advantages/std": 0.5228010416030884,
|
|
"advantages/var": 0.27332092910127415,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.043010752688172,
|
|
"grad_norm": 0.11527620657387849,
|
|
"learning_rate": 1.9181830908158926e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31394106.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.15019109845161438,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 363
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599761052090956e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.0458781362007168,
|
|
"grad_norm": 0.1078137751536225,
|
|
"learning_rate": 1.9177367497272524e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 31473779.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 364
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262414834030685e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.0487455197132616,
|
|
"grad_norm": 0.09935113929320916,
|
|
"learning_rate": 1.9172892466898046e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 31554710.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.16097760200500488,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 365
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.0516129032258064,
|
|
"grad_norm": 0.12190199299698741,
|
|
"learning_rate": 1.916840582270134e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31633384.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 366
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726947784423828,
|
|
"advantages/var": 0.32797930925516994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.0544802867383511,
|
|
"grad_norm": 0.0931859343367811,
|
|
"learning_rate": 1.916390757036296e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31725141.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.17464229464530945,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 367
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.505062750816392e-09,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.0573476702508962,
|
|
"grad_norm": 0.12607688041530746,
|
|
"learning_rate": 1.9159397715578158e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31805654.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.15072786808013916,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 368
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.060215053763441,
|
|
"grad_norm": 0.10770929354909455,
|
|
"learning_rate": 1.915487626405686e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31896248.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 369
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299811976811062e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.0630824372759857,
|
|
"grad_norm": 0.07469731951388661,
|
|
"learning_rate": 1.9150343221523694e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 31965584.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 370
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131329284837406e-09,
|
|
"advantages/std": 0.5726754665374756,
|
|
"advantages/var": 0.3279571899739153,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 1.0659498207885305,
|
|
"grad_norm": 0.1006389283231685,
|
|
"learning_rate": 1.914579859371796e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 32051745.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.15072788298130035,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 371
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.0688172043010753,
|
|
"grad_norm": 0.1279205858622629,
|
|
"learning_rate": 1.914124238639362e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 32147060.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 372
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.011311787641491e-09,
|
|
"advantages/std": 0.6185494065284729,
|
|
"advantages/var": 0.38260336831672603,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.07168458781362,
|
|
"grad_norm": 0.1263450693937804,
|
|
"learning_rate": 1.91366746053193e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 32233181.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.16151440143585205,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 373
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878778782737308e-09,
|
|
"advantages/std": 0.5726776719093323,
|
|
"advantages/var": 0.3279597159034928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.0745519713261649,
|
|
"grad_norm": 0.1320136897278803,
|
|
"learning_rate": 1.913209525627828e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 32318847.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.154142826795578,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 374
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.0774193548387097,
|
|
"grad_norm": 0.03337057156737115,
|
|
"learning_rate": 1.912750434506848e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 32399870.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 375
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.0802867383512544,
|
|
"grad_norm": 0.0909052441690691,
|
|
"learning_rate": 1.912290187750247e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 32481002.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 376
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958683201273463e-10,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.0831541218637992,
|
|
"grad_norm": 0.08128416112795446,
|
|
"learning_rate": 1.9118287859407446e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 32561635.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 377
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.086021505376344,
|
|
"grad_norm": 0.0686112977829753,
|
|
"learning_rate": 1.9113662296625223e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 32633079.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 378
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.0888888888888888,
|
|
"grad_norm": 0.07426607789781772,
|
|
"learning_rate": 1.9109025195012243e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 32719141.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 379
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878856475277099e-09,
|
|
"advantages/std": 0.5726685523986816,
|
|
"advantages/var": 0.3279492709064016,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.0917562724014336,
|
|
"grad_norm": 0.15765355244245488,
|
|
"learning_rate": 1.9104376560439544e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 32800653.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.14283226430416107,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 380
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.0111551007775335e-09,
|
|
"advantages/std": 0.6185815930366516,
|
|
"advantages/var": 0.3826431872437617,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.0946236559139786,
|
|
"grad_norm": 0.1122812046090102,
|
|
"learning_rate": 1.9099716398792783e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 32880981.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.20357418060302734,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 381
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.1499403476539522e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.0974910394265234,
|
|
"grad_norm": 0.11363141288519696,
|
|
"learning_rate": 1.90950447159722e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 32959971.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 382
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.1003584229390682,
|
|
"grad_norm": 0.09497901297409572,
|
|
"learning_rate": 1.9090361517892617e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 33033418.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 383
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907175712742112e-09,
|
|
"advantages/std": 0.5227934122085571,
|
|
"advantages/var": 0.27331295184866633,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 1.103225806451613,
|
|
"grad_norm": 0.06460150146892275,
|
|
"learning_rate": 1.9085666810483457e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 33129949.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1422954797744751,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 384
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.1060931899641577,
|
|
"grad_norm": 0.12433395722331816,
|
|
"learning_rate": 1.908096059968869e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 33206078.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 385
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125710510822718e-09,
|
|
"advantages/std": 0.5227956175804138,
|
|
"advantages/var": 0.2733152577612863,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.1089605734767025,
|
|
"grad_norm": 0.1081801352944286,
|
|
"learning_rate": 1.907624289146686e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 33289958.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.14230038225650787,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 386
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5628430692729714e-09,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.1118279569892473,
|
|
"grad_norm": 0.09246587460981719,
|
|
"learning_rate": 1.9071513691791077e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 33380170.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 387
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 7.528248275317514e-09,
|
|
"advantages/std": 0.6185519695281982,
|
|
"advantages/var": 0.3826065390072131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.114695340501792,
|
|
"grad_norm": 0.14999799399924482,
|
|
"learning_rate": 1.9066773006648988e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 33459127.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1659901738166809,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 388
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599751911938768e-09,
|
|
"advantages/std": 0.40494468808174133,
|
|
"advantages/var": 0.16398020040561878,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 1.1175627240143369,
|
|
"grad_norm": 0.08925593566328756,
|
|
"learning_rate": 1.906202084204279e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 33527677.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 389
|
|
},
|
|
{
|
|
"advantages/mean": 6.984919309616089e-09,
|
|
"advantages/snr": 1.3360585347103394e-08,
|
|
"advantages/std": 0.5228003859519958,
|
|
"advantages/var": 0.2733202435515558,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.1204301075268817,
|
|
"grad_norm": 0.11445198267119233,
|
|
"learning_rate": 1.9057257203989203e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 33619759.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.14913025498390198,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 390
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 9.85858603726782e-09,
|
|
"advantages/std": 0.661277174949646,
|
|
"advantages/var": 0.4372875021093847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.1232974910394264,
|
|
"grad_norm": 0.17799224247260523,
|
|
"learning_rate": 1.905248209851949e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 33704395.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.2120065987110138,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 391
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 7.96685234149694e-09,
|
|
"advantages/std": 0.7013981342315674,
|
|
"advantages/var": 0.4919593427035238,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.1261648745519715,
|
|
"grad_norm": 0.16809757067938313,
|
|
"learning_rate": 1.904769553167942e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 33797595.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.24883408844470978,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 392
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983429612088697e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.129032258064516,
|
|
"grad_norm": 0.13776121916921183,
|
|
"learning_rate": 1.9042897509529277e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 33882619.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 393
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2524227119579474e-09,
|
|
"advantages/std": 0.5726946592330933,
|
|
"advantages/var": 0.3279791727141088,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.131899641577061,
|
|
"grad_norm": 0.148615746974474,
|
|
"learning_rate": 1.9038088038143849e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 33962960.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.17123225331306458,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 394
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.1347670250896058,
|
|
"grad_norm": 0.09902327906445091,
|
|
"learning_rate": 1.9033267123612417e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 34050097.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 395
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516894993554525e-09,
|
|
"advantages/std": 0.6185593605041504,
|
|
"advantages/var": 0.3826156824673035,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.1376344086021506,
|
|
"grad_norm": 0.15064301550607218,
|
|
"learning_rate": 1.9028434772038762e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 34148015.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.17517907917499542,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 396
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.1405017921146954,
|
|
"grad_norm": 0.06188828252462216,
|
|
"learning_rate": 1.9023590989541126e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 34226218.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 397
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726962685585022,
|
|
"advantages/var": 0.3279810160208321,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 1.1433691756272402,
|
|
"grad_norm": 0.12212723663588687,
|
|
"learning_rate": 1.9018735782252242e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 34325697.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.17358636856079102,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 398
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299880526045478e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.146236559139785,
|
|
"grad_norm": 0.06080905471047067,
|
|
"learning_rate": 1.9013869156319296e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 34404988.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 399
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 9.29469581896395e-09,
|
|
"advantages/std": 0.7013955116271973,
|
|
"advantages/var": 0.4919556637307778,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.578125,
|
|
"epoch": 1.1491039426523297,
|
|
"grad_norm": 0.15628603367531402,
|
|
"learning_rate": 1.9008991117903937e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 34496070.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.2467075139284134,
|
|
"rewards/drgrpo_math_reward/mean": 0.5625,
|
|
"rewards/drgrpo_math_reward/std": 0.49802759289741516,
|
|
"step": 400
|
|
},
|
|
{
|
|
"advantages/mean": 6.05359673500061e-09,
|
|
"advantages/snr": 1.494855892459685e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.1519713261648745,
|
|
"grad_norm": 0.09673900792502516,
|
|
"learning_rate": 1.9004101673182258e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 34585623.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 401
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.609832486150278e-08,
|
|
"advantages/std": 0.40496498346328735,
|
|
"advantages/var": 0.1639966378314206,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.1548387096774193,
|
|
"grad_norm": 0.05818237777324348,
|
|
"learning_rate": 1.8999200828344804e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 34666399.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 402
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131336901697576e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.157706093189964,
|
|
"grad_norm": 0.10039950957745691,
|
|
"learning_rate": 1.8994288589596539e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 34755804.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 403
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.1579550221161443e-08,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.1605734767025089,
|
|
"grad_norm": 0.10524671812993915,
|
|
"learning_rate": 1.8989364963156868e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 34841675.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 404
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.1634408602150539,
|
|
"grad_norm": 0.1312953481263664,
|
|
"learning_rate": 1.8984429955259604e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 34919375.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 405
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46761295199394226,
|
|
"advantages/var": 0.21866187287248895,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.1663082437275984,
|
|
"grad_norm": 0.08193295618037837,
|
|
"learning_rate": 1.8979483572152972e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 35000480.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1246790736913681,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 406
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 9.199522104181912e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 1.1691756272401435,
|
|
"grad_norm": 0.06823624130547844,
|
|
"learning_rate": 1.8974525820099605e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 35100593.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 407
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.1720430107526882,
|
|
"grad_norm": 0.0809172892703687,
|
|
"learning_rate": 1.8969556705376518e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 35174411.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 408
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721682514236524e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.174910394265233,
|
|
"grad_norm": 0.15140274321080863,
|
|
"learning_rate": 1.896457623427512e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 35264135.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 409
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917221686896894e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.1777777777777778,
|
|
"grad_norm": 0.11361003928383843,
|
|
"learning_rate": 1.8959584413101206e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 35351533.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 410
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.1266523706756892e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.1806451612903226,
|
|
"grad_norm": 0.04996767911029415,
|
|
"learning_rate": 1.8954581248174925e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 35432543.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 411
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.1835125448028674,
|
|
"grad_norm": 0.076903562461185,
|
|
"learning_rate": 1.8949566745830801e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 35510747.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 412
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.2675146677787739e-08,
|
|
"advantages/std": 0.661286473274231,
|
|
"advantages/var": 0.4372997997354702,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.1863799283154122,
|
|
"grad_norm": 0.12710704937627615,
|
|
"learning_rate": 1.8944540912417708e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 35598106.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.22567126154899597,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 413
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954488684246125e-08,
|
|
"advantages/std": 0.4675959050655365,
|
|
"advantages/var": 0.21864593043405822,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.189247311827957,
|
|
"grad_norm": 0.08600713301720207,
|
|
"learning_rate": 1.8939503754298865e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 35680882.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 414
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983450684521008e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.1921146953405017,
|
|
"grad_norm": 0.0961858939890946,
|
|
"learning_rate": 1.893445527785183e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 35755324.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 415
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 1.1949820788530465,
|
|
"grad_norm": 0.060517150584917266,
|
|
"learning_rate": 1.8929395489468494e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 35840539.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 416
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 1.1978494623655913,
|
|
"grad_norm": 0.07215171444096555,
|
|
"learning_rate": 1.8924324395555066e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 35915811.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.59375,
|
|
"rewards/drgrpo_math_reward/std": 0.4930621087551117,
|
|
"step": 417
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 8.049402934764073e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.2007168458781363,
|
|
"grad_norm": 0.24810926566863736,
|
|
"learning_rate": 1.891924200253207e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 35998688.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 418
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.563002308053355e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.2035842293906809,
|
|
"grad_norm": 0.09293770477831811,
|
|
"learning_rate": 1.8914148316834337e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 36086737.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 419
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199094228701277e-09,
|
|
"advantages/std": 0.4049627184867859,
|
|
"advantages/var": 0.1639948033642078,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.206451612903226,
|
|
"grad_norm": 0.0855377069771544,
|
|
"learning_rate": 1.8909043344911e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 36170106.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.5625,
|
|
"rewards/drgrpo_math_reward/std": 0.49802759289741516,
|
|
"step": 420
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344503462080032e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.2093189964157707,
|
|
"grad_norm": 0.08230445158939939,
|
|
"learning_rate": 1.890392709322547e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 36242565.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.12073517590761185,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 421
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.504923957817997e-09,
|
|
"advantages/std": 0.5726877450942993,
|
|
"advantages/var": 0.32797125338119315,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.2121863799283155,
|
|
"grad_norm": 0.17186986086914186,
|
|
"learning_rate": 1.889879956825545e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 36330470.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 422
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.2150537634408602,
|
|
"grad_norm": 0.0892643304423356,
|
|
"learning_rate": 1.8893660776492911e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 36401660.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 423
|
|
},
|
|
{
|
|
"advantages/mean": 6.51925802230835e-09,
|
|
"advantages/snr": 1.1383827823242494e-08,
|
|
"advantages/std": 0.5726771354675293,
|
|
"advantages/var": 0.3279591014872949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 1.217921146953405,
|
|
"grad_norm": 0.11507868659846966,
|
|
"learning_rate": 1.8888510724444092e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 36483608.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.1530819833278656,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 424
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504949684853452e-09,
|
|
"advantages/std": 0.5726854801177979,
|
|
"advantages/var": 0.32796865913775264,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.2207885304659498,
|
|
"grad_norm": 0.09813999306223548,
|
|
"learning_rate": 1.8883349418629485e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 36576834.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.15992169082164764,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 425
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.90714930920747e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.2236559139784946,
|
|
"grad_norm": 0.10870643147935784,
|
|
"learning_rate": 1.8878176865583831e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 36661457.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 426
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.2265232974910394,
|
|
"grad_norm": 0.07322473260223201,
|
|
"learning_rate": 1.8872993071856112e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 36752775.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 427
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.2293906810035842,
|
|
"grad_norm": 0.0673104946087146,
|
|
"learning_rate": 1.8867798044009546e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 36826486.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 428
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 1.232258064516129,
|
|
"grad_norm": 0.060029529820155615,
|
|
"learning_rate": 1.886259178862157e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 36920849.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 429
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379866977655094e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.2351254480286737,
|
|
"grad_norm": 0.12504837454470752,
|
|
"learning_rate": 1.8857374312283835e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 36989818.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 430
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9753001796492024e-09,
|
|
"advantages/std": 0.4675861597061157,
|
|
"advantages/var": 0.21863681674871316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.2379928315412188,
|
|
"grad_norm": 0.06734258077817594,
|
|
"learning_rate": 1.8852145621602204e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 37070613.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 431
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 7.318093530282481e-09,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.2408602150537635,
|
|
"grad_norm": 0.10759439920719689,
|
|
"learning_rate": 1.8846905723196732e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 37151115.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 432
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.562795544129766e-09,
|
|
"advantages/std": 0.5228043794631958,
|
|
"advantages/var": 0.2733244191858972,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.2437275985663083,
|
|
"grad_norm": 0.12803270673421868,
|
|
"learning_rate": 1.8841654623701671e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 37239984.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.1514892876148224,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 433
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.131306434342512e-10,
|
|
"advantages/std": 0.5726770758628845,
|
|
"advantages/var": 0.327959033218864,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.246594982078853,
|
|
"grad_norm": 0.11317202504210273,
|
|
"learning_rate": 1.8836392329765448e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 37321224.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.1530819833278656,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 434
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2525191884915387e-09,
|
|
"advantages/std": 0.5726776719093323,
|
|
"advantages/var": 0.3279597159034928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.249462365591398,
|
|
"grad_norm": 0.15134775700601374,
|
|
"learning_rate": 1.8831118848050666e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 37390443.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.154142826795578,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 435
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562923093105361e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.2523297491039427,
|
|
"grad_norm": 0.09403764676074379,
|
|
"learning_rate": 1.8825834185234098e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 37476406.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 436
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.225143719438781e-09,
|
|
"advantages/std": 0.6612716317176819,
|
|
"advantages/var": 0.4372801709145655,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.2551971326164875,
|
|
"grad_norm": 0.11909031388211558,
|
|
"learning_rate": 1.8820538348006666e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 37572061.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.20517179369926453,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 437
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.2580645161290323,
|
|
"grad_norm": 0.09797707756153493,
|
|
"learning_rate": 1.8815231343073444e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 37660072.0,
|
|
"reward": 0.5703125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.5703125,
|
|
"rewards/drgrpo_math_reward/std": 0.4969765841960907,
|
|
"step": 438
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.5167326511663e-09,
|
|
"advantages/std": 0.6185815930366516,
|
|
"advantages/var": 0.3826431872437617,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.260931899641577,
|
|
"grad_norm": 0.11722912957013189,
|
|
"learning_rate": 1.880991317715364e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 37753098.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.20357418060302734,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 439
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504965933612274e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.2637992831541218,
|
|
"grad_norm": 0.13012720565928393,
|
|
"learning_rate": 1.8804583856980603e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 37828784.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 440
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 8.45014868788346e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.2666666666666666,
|
|
"grad_norm": 0.06212992035853581,
|
|
"learning_rate": 1.8799243389301796e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 37897771.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 441
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814751550759118e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.2695340501792114,
|
|
"grad_norm": 0.06849675578675044,
|
|
"learning_rate": 1.8793891780878798e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 37972188.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 442
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.562891001898203e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.2724014336917562,
|
|
"grad_norm": 0.07792881497712156,
|
|
"learning_rate": 1.8788529038487296e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38064377.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.13781969249248505,
|
|
"rewards/drgrpo_math_reward/mean": 0.59375,
|
|
"rewards/drgrpo_math_reward/std": 0.4930621087551117,
|
|
"step": 443
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629340611188405e-09,
|
|
"advantages/std": 0.5227840542793274,
|
|
"advantages/var": 0.27330316740873073,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.2752688172043012,
|
|
"grad_norm": 0.10833904988467241,
|
|
"learning_rate": 1.8783155168917068e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38145860.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.13204573094844818,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 444
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.494855892459685e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.2781362007168457,
|
|
"grad_norm": 0.07476540605174994,
|
|
"learning_rate": 1.8777770178971987e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38227493.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 445
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.516761811003565e-09,
|
|
"advantages/std": 0.6185775995254517,
|
|
"advantages/var": 0.38263824663467005,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.2810035842293908,
|
|
"grad_norm": 0.1370815896913483,
|
|
"learning_rate": 1.8772374075470006e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 38324888.0,
|
|
"reward": 0.5390625,
|
|
"reward_std": 0.19674429297447205,
|
|
"rewards/drgrpo_math_reward/mean": 0.5390625,
|
|
"rewards/drgrpo_math_reward/std": 0.5004304051399231,
|
|
"step": 446
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.781509278854418e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.2838709677419355,
|
|
"grad_norm": 0.0993472149618786,
|
|
"learning_rate": 1.8766966865243136e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38410186.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 447
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131154947272497e-09,
|
|
"advantages/std": 0.5726877450942993,
|
|
"advantages/var": 0.32797125338119315,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.2867383512544803,
|
|
"grad_norm": 0.1029740923633324,
|
|
"learning_rate": 1.8761548555137466e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38498004.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 448
|
|
},
|
|
{
|
|
"advantages/mean": 5.587935447692871e-09,
|
|
"advantages/snr": 1.1950401561551717e-08,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.2896057347670251,
|
|
"grad_norm": 0.09605681530603226,
|
|
"learning_rate": 1.875611915201313e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38577923.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 449
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.970882307601416e-09,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.29247311827957,
|
|
"grad_norm": 0.09269184158527434,
|
|
"learning_rate": 1.8750678662744308e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38658214.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 450
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383531621724775e-08,
|
|
"advantages/std": 0.5726920366287231,
|
|
"advantages/var": 0.32797616881795477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 1.2953405017921147,
|
|
"grad_norm": 0.12109369035741321,
|
|
"learning_rate": 1.8745227094219218e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38745225.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.1701665222644806,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 451
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125678014490734e-09,
|
|
"advantages/std": 0.5227980017662048,
|
|
"advantages/var": 0.2733177506507367,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.2982078853046595,
|
|
"grad_norm": 0.08139897608330982,
|
|
"learning_rate": 1.8739764453340107e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38829061.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.14571532607078552,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 452
|
|
},
|
|
{
|
|
"advantages/mean": 5.587935447692871e-09,
|
|
"advantages/snr": 1.379887186086637e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.3010752688172043,
|
|
"grad_norm": 0.08981833431419232,
|
|
"learning_rate": 1.8734290747023237e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 38908410.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 453
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016504754270957e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.303942652329749,
|
|
"grad_norm": 0.12137991283167442,
|
|
"learning_rate": 1.8728805982198877e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 38993045.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.13781969249248505,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 454
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 8.944303023925922e-09,
|
|
"advantages/std": 0.5726856589317322,
|
|
"advantages/var": 0.3279688639460723,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.3068100358422938,
|
|
"grad_norm": 0.11385847015754162,
|
|
"learning_rate": 1.8723310165811308e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39074152.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.16333173215389252,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 455
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.1498803472501986e-09,
|
|
"advantages/std": 0.40496498346328735,
|
|
"advantages/var": 0.1639966378314206,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.3096774193548386,
|
|
"grad_norm": 0.08996537497957356,
|
|
"learning_rate": 1.8717803304818794e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39162360.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 456
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966610933966542e-09,
|
|
"advantages/std": 0.4676129221916199,
|
|
"advantages/var": 0.21866184500058594,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.3125448028673836,
|
|
"grad_norm": 0.08467746260660809,
|
|
"learning_rate": 1.8712285406193585e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39247316.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1246790662407875,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 457
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.3278566809822465e-09,
|
|
"advantages/std": 0.7013728022575378,
|
|
"advantages/var": 0.4919238077465913,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.3154121863799282,
|
|
"grad_norm": 0.14786741187405614,
|
|
"learning_rate": 1.8706756476921907e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39335135.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.21254336833953857,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 458
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562869878887461e-09,
|
|
"advantages/std": 0.5227934718132019,
|
|
"advantages/var": 0.27331301417050113,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.3182795698924732,
|
|
"grad_norm": 0.11689445554737352,
|
|
"learning_rate": 1.8701216524003953e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39424066.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.1422954797744751,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 459
|
|
},
|
|
{
|
|
"advantages/mean": 7.450580596923828e-09,
|
|
"advantages/snr": 1.2044824669190209e-08,
|
|
"advantages/std": 0.6185711026191711,
|
|
"advantages/var": 0.38263020899549716,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.321146953405018,
|
|
"grad_norm": 0.09807668103246633,
|
|
"learning_rate": 1.8695665554453868e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 39508401.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.18884865939617157,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 460
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536934733273465e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.3240143369175628,
|
|
"grad_norm": 0.06544111566552296,
|
|
"learning_rate": 1.8690103575299752e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39593601.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 461
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.2196958382045529e-08,
|
|
"advantages/std": 0.5726771354675293,
|
|
"advantages/var": 0.3279591014872949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.3268817204301075,
|
|
"grad_norm": 0.12305107284888928,
|
|
"learning_rate": 1.8684530593583636e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39670753.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1530819982290268,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 462
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444254652277355e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.3297491039426523,
|
|
"grad_norm": 0.12313119220773869,
|
|
"learning_rate": 1.867894661636149e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 39757110.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 463
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.572691023349762,
|
|
"advantages/var": 0.3279750082253976,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.3326164874551971,
|
|
"grad_norm": 0.11870701325721736,
|
|
"learning_rate": 1.8673351650703201e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39837325.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.16887323558330536,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 464
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.335483870967742,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.866774570369257e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 39916341.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 465
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185612082481384,
|
|
"advantages/var": 0.3826179683493969,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.3383512544802867,
|
|
"grad_norm": 0.1028369035844959,
|
|
"learning_rate": 1.8662128782427297e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40007964.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.1751839816570282,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 466
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2524501310221626e-09,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.3412186379928315,
|
|
"grad_norm": 0.14570376248276248,
|
|
"learning_rate": 1.8656500894018986e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 40101423.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.16675159335136414,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 467
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917221686896894e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.3440860215053765,
|
|
"grad_norm": 0.14877932191107487,
|
|
"learning_rate": 1.8650862045593114e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40184237.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 468
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950220288145723e-08,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.346953405017921,
|
|
"grad_norm": 0.08974549401489106,
|
|
"learning_rate": 1.8645212244289047e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40263229.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 469
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599556592183647e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.349820788530466,
|
|
"grad_norm": 0.08507901581401366,
|
|
"learning_rate": 1.8639551497260007e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40349793.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 470
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.3526881720430106,
|
|
"grad_norm": 0.08376628223431468,
|
|
"learning_rate": 1.8633879811673086e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40424175.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 471
|
|
},
|
|
{
|
|
"advantages/mean": -7.916241884231567e-09,
|
|
"advantages/snr": 1.1286513223603052e-08,
|
|
"advantages/std": 0.701389491558075,
|
|
"advantages/var": 0.4919472188680949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 0.1185452237370237,
|
|
"learning_rate": 1.8628197194709213e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40514516.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.23857945203781128,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 472
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.3584229390681004,
|
|
"grad_norm": 0.09299044822566034,
|
|
"learning_rate": 1.862250365356317e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 40605730.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 473
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 1.3612903225806452,
|
|
"grad_norm": 0.1015467554994938,
|
|
"learning_rate": 1.8616799195443563e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 40690416.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 474
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.8788021410185465e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.36415770609319,
|
|
"grad_norm": 0.12127591444840169,
|
|
"learning_rate": 1.8611083827572815e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40782303.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 475
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504917864602565e-09,
|
|
"advantages/std": 0.5726882815361023,
|
|
"advantages/var": 0.32797186780877396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.3670250896057348,
|
|
"grad_norm": 0.11257977885529864,
|
|
"learning_rate": 1.8605357557187172e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40868256.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1643974632024765,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 476
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.3698924731182796,
|
|
"grad_norm": 0.09588521532557784,
|
|
"learning_rate": 1.859962039153668e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 40950899.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 477
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.878747807970186e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.3727598566308243,
|
|
"grad_norm": 0.10874589141704576,
|
|
"learning_rate": 1.8593872337885175e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 41034717.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 478
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899641578136434e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.3756272401433691,
|
|
"grad_norm": 0.07697473222804034,
|
|
"learning_rate": 1.8588113403510286e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 41118625.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 479
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 1.378494623655914,
|
|
"grad_norm": 0.10762834890644403,
|
|
"learning_rate": 1.8582343595703414e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 41200803.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 480
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 1.381362007168459,
|
|
"grad_norm": 0.0867693183099241,
|
|
"learning_rate": 1.8576562921769726e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 41286801.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 481
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131297971228846e-09,
|
|
"advantages/std": 0.5726776719093323,
|
|
"advantages/var": 0.3279597159034928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.3842293906810035,
|
|
"grad_norm": 0.15292227824201848,
|
|
"learning_rate": 1.8570771389028148e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 41371319.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.154142826795578,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 482
|
|
},
|
|
{
|
|
"advantages/mean": -1.0244548320770264e-08,
|
|
"advantages/snr": 1.5491760762766434e-08,
|
|
"advantages/std": 0.6612901091575623,
|
|
"advantages/var": 0.4373046084696206,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.3870967741935485,
|
|
"grad_norm": 0.14366149758916022,
|
|
"learning_rate": 1.8564969004811354e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 41460754.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.2290911078453064,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 483
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299807237755752e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.3899641577060933,
|
|
"grad_norm": 0.07126110064477205,
|
|
"learning_rate": 1.8559155776465756e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 41545261.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 484
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.392831541218638,
|
|
"grad_norm": 0.11155526099221504,
|
|
"learning_rate": 1.8553331711351498e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 41628232.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 485
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 8.907056898068929e-09,
|
|
"advantages/std": 0.5228003859519958,
|
|
"advantages/var": 0.2733202435515558,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.3956989247311828,
|
|
"grad_norm": 0.13006837864211654,
|
|
"learning_rate": 1.8547496816842446e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 41710869.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.14913025498390198,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 486
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.75753319179351e-09,
|
|
"advantages/std": 0.5726791024208069,
|
|
"advantages/var": 0.327961354349501,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.3985663082437276,
|
|
"grad_norm": 0.1012756121633877,
|
|
"learning_rate": 1.8541651100326172e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 41799936.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.15308690071105957,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 487
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262475767256781e-09,
|
|
"advantages/std": 0.5726819038391113,
|
|
"advantages/var": 0.32796456298478915,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.4014336917562724,
|
|
"grad_norm": 0.13748119752207785,
|
|
"learning_rate": 1.853579456920395e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 41878924.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.15756267309188843,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 488
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983473534398352e-09,
|
|
"advantages/std": 0.4675931930541992,
|
|
"advantages/var": 0.21864339419062162,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.4043010752688172,
|
|
"grad_norm": 0.10745931360439237,
|
|
"learning_rate": 1.8529927230890755e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 41951782.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 489
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 1.9917227329728737e-09,
|
|
"advantages/std": 0.7013947367668152,
|
|
"advantages/var": 0.49195457676418997,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.407168458781362,
|
|
"grad_norm": 0.15995655732792285,
|
|
"learning_rate": 1.8524049092815236e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 42036663.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.24541422724723816,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 490
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.691702426092771e-09,
|
|
"advantages/std": 0.5726984143257141,
|
|
"advantages/var": 0.3279834737711873,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.4100358422939068,
|
|
"grad_norm": 0.11393319820939447,
|
|
"learning_rate": 1.8518160162419718e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42120967.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.17700131237506866,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 491
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.4083418875242214e-09,
|
|
"advantages/std": 0.6612901091575623,
|
|
"advantages/var": 0.4373046084696206,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.4129032258064516,
|
|
"grad_norm": 0.1200229835178394,
|
|
"learning_rate": 1.8512260447160187e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42210381.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.2290911078453064,
|
|
"rewards/drgrpo_math_reward/mean": 0.5625,
|
|
"rewards/drgrpo_math_reward/std": 0.49802759289741516,
|
|
"step": 492
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 8.280895157741607e-09,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.4157706093189963,
|
|
"grad_norm": 0.13720668328810662,
|
|
"learning_rate": 1.8506349954506297e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42291297.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1820138841867447,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 493
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 7.527983501262913e-10,
|
|
"advantages/std": 0.6185737252235413,
|
|
"advantages/var": 0.3826334535369291,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.4186379928315414,
|
|
"grad_norm": 0.13930169565034922,
|
|
"learning_rate": 1.8500428691941328e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42381129.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.19332444667816162,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 494
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.421505376344086,
|
|
"grad_norm": 0.10696979494855548,
|
|
"learning_rate": 1.8494496666962206e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 42465657.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 495
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983495876754114e-09,
|
|
"advantages/std": 0.4675905704498291,
|
|
"advantages/var": 0.2186409415735966,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.424372759856631,
|
|
"grad_norm": 0.07145280671923458,
|
|
"learning_rate": 1.848855388707949e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42547630.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10205792635679245,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 496
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.234939117800795e-09,
|
|
"advantages/std": 0.5228004455566406,
|
|
"advantages/var": 0.27332030587422196,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.4272401433691757,
|
|
"grad_norm": 0.09676791260047765,
|
|
"learning_rate": 1.8482600359817342e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42627615.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.14913025498390198,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 497
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.0112444683242622e-09,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.4301075268817205,
|
|
"grad_norm": 0.14074182529916074,
|
|
"learning_rate": 1.847663609271354e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42719557.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.17859892547130585,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 498
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975300560494157e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.4329749103942653,
|
|
"grad_norm": 0.10017823898476097,
|
|
"learning_rate": 1.847066109331946e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42800302.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 499
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185612082481384,
|
|
"advantages/var": 0.3826179683493969,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.43584229390681,
|
|
"grad_norm": 0.1179838232391151,
|
|
"learning_rate": 1.8464675369200057e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42888803.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.1751839816570282,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 500
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5629027821475993e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.4387096774193548,
|
|
"grad_norm": 0.09646195424819327,
|
|
"learning_rate": 1.8458678927933882e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 42979611.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 501
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.4415770609318996,
|
|
"grad_norm": 0.05897016915946197,
|
|
"learning_rate": 1.8452671777113033e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 43051799.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 502
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599658819865184e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.4444444444444444,
|
|
"grad_norm": 0.09539664660420902,
|
|
"learning_rate": 1.8446653924343188e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 43130374.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 503
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8994882297977766e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.4473118279569892,
|
|
"grad_norm": 0.09780941279649633,
|
|
"learning_rate": 1.8440625377243557e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 43211497.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 504
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.991766726549734e-09,
|
|
"advantages/std": 0.4675861597061157,
|
|
"advantages/var": 0.21863681674871316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.450179211469534,
|
|
"grad_norm": 0.11666079982718623,
|
|
"learning_rate": 1.8434586143446905e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 43289312.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 505
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.4530465949820788,
|
|
"grad_norm": 0.049559262838082686,
|
|
"learning_rate": 1.842853623059952e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 43367438.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 506
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.4559139784946238,
|
|
"grad_norm": 0.05497588612217425,
|
|
"learning_rate": 1.8422475646361208e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 43440860.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 507
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 1.4587813620071683,
|
|
"grad_norm": 0.06195988661538175,
|
|
"learning_rate": 1.8416404398405296e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 43533713.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 508
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983628920516591e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.4616487455197134,
|
|
"grad_norm": 0.1537818005926993,
|
|
"learning_rate": 1.8410322494418603e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 43604263.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 509
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.4645161290322581,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.8404229942101442e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 43679233.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 510
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958683201273463e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.467383512544803,
|
|
"grad_norm": 0.16557840487058936,
|
|
"learning_rate": 1.8398126749167613e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 43759594.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 511
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.0111388527025745e-09,
|
|
"advantages/std": 0.618584930896759,
|
|
"advantages/var": 0.38264731673254815,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.4702508960573477,
|
|
"grad_norm": 0.11408788231286815,
|
|
"learning_rate": 1.8392012923344378e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 43852880.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.20593319833278656,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 512
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 9.154460730631332e-09,
|
|
"advantages/std": 0.6612728834152222,
|
|
"advantages/var": 0.437281826340282,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.4731182795698925,
|
|
"grad_norm": 0.21363207577931034,
|
|
"learning_rate": 1.838588847237247e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 43924952.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.2041158676147461,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 513
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.4759856630824373,
|
|
"grad_norm": 0.10296469008522874,
|
|
"learning_rate": 1.8379753404006073e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 44006988.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 514
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.2583373839143607e-09,
|
|
"advantages/std": 0.6185895204544067,
|
|
"advantages/var": 0.3826529948160129,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.478853046594982,
|
|
"grad_norm": 0.11876060878737717,
|
|
"learning_rate": 1.8373607726012811e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 44084728.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.21382391452789307,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 515
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.034946312888557e-08,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.4817204301075269,
|
|
"grad_norm": 0.08411429785495499,
|
|
"learning_rate": 1.8367451446173746e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 44165115.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 516
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.4845878136200716,
|
|
"grad_norm": 0.05578709283562855,
|
|
"learning_rate": 1.8361284572283354e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 44246006.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 517
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016755193120049e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.4874551971326164,
|
|
"grad_norm": 0.11364701408577403,
|
|
"learning_rate": 1.835510711214953e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 44322867.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 518
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.562954778661877e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.4903225806451612,
|
|
"grad_norm": 0.10914034535598677,
|
|
"learning_rate": 1.8348919073593575e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 44414575.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 519
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 8.450170142019348e-09,
|
|
"advantages/std": 0.6612808108329773,
|
|
"advantages/var": 0.4372923107759199,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.4931899641577062,
|
|
"grad_norm": 0.12566288606330955,
|
|
"learning_rate": 1.834272046445018e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 44516535.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.21542643010616302,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 520
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.50497202691776e-09,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.4960573476702508,
|
|
"grad_norm": 0.08963705937595029,
|
|
"learning_rate": 1.8336511292567418e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 44596646.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 521
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383871662376608e-08,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.4989247311827958,
|
|
"grad_norm": 0.12113829535017172,
|
|
"learning_rate": 1.8330291565806734e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 44673357.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 522
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.5017921146953404,
|
|
"grad_norm": 0.07934921760227831,
|
|
"learning_rate": 1.832406129204295e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 44764356.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 523
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.022546389467422e-09,
|
|
"advantages/std": 0.618557333946228,
|
|
"advantages/var": 0.38261317537866546,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 1.5046594982078854,
|
|
"grad_norm": 0.13005206333052863,
|
|
"learning_rate": 1.8317820479164219e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 44856882.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.17176413536071777,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 524
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.987538125611118e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.5075268817204301,
|
|
"grad_norm": 0.08623005065735916,
|
|
"learning_rate": 1.8311569135072059e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 44937742.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 525
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.75738695226396e-09,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.510394265232975,
|
|
"grad_norm": 0.1094564783292251,
|
|
"learning_rate": 1.8305307267681305e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 45017324.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 526
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497179652165926e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.5132616487455197,
|
|
"grad_norm": 0.1083477319515736,
|
|
"learning_rate": 1.8299034884920128e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 45088340.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 527
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.453569069264198e-09,
|
|
"advantages/std": 0.5227956175804138,
|
|
"advantages/var": 0.2733152577612863,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.5161290322580645,
|
|
"grad_norm": 0.10553993076350929,
|
|
"learning_rate": 1.829275199473001e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 45174127.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.14230038225650787,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 528
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 1.0954471927793293e-08,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.5189964157706093,
|
|
"grad_norm": 0.07736508690404215,
|
|
"learning_rate": 1.8286458605065728e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 45258212.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 529
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907216333870301e-10,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.521863799283154,
|
|
"grad_norm": 0.12359899648958964,
|
|
"learning_rate": 1.828015472389536e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 45342083.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 530
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.011196882346898e-09,
|
|
"advantages/std": 0.618573009967804,
|
|
"advantages/var": 0.3826325686606289,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.524731182795699,
|
|
"grad_norm": 0.1403189014289581,
|
|
"learning_rate": 1.827384035920027e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 45426887.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.18885356187820435,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 531
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504949684853452e-09,
|
|
"advantages/std": 0.5726854801177979,
|
|
"advantages/var": 0.32796865913775264,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 1.5275985663082436,
|
|
"grad_norm": 0.1282566540666401,
|
|
"learning_rate": 1.8267515518975086e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 45516431.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.15992169082164764,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 532
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.9752099207640785e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.5304659498207887,
|
|
"grad_norm": 0.12702320164205028,
|
|
"learning_rate": 1.8261180211227707e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 45580640.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 533
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721527138226332e-09,
|
|
"advantages/std": 0.5227934122085571,
|
|
"advantages/var": 0.27331295184866633,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.5333333333333332,
|
|
"grad_norm": 0.18715919069910997,
|
|
"learning_rate": 1.825483444397928e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 45663240.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.1422954797744751,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 534
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.2525831708228704e-09,
|
|
"advantages/std": 0.5726664066314697,
|
|
"advantages/var": 0.32794681328419983,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.5362007168458782,
|
|
"grad_norm": 0.16291162730230088,
|
|
"learning_rate": 1.8248478225264199e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 45741546.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.13941730558872223,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 535
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.5390681003584228,
|
|
"grad_norm": 0.13444607915853476,
|
|
"learning_rate": 1.8242111563130088e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 45818648.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 536
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.970911630250105e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.5419354838709678,
|
|
"grad_norm": 0.08328740955067904,
|
|
"learning_rate": 1.8235734465637792e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 45903903.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 537
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96694656101877e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.5448028673835126,
|
|
"grad_norm": 0.11540333279554736,
|
|
"learning_rate": 1.8229346940861373e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 45981684.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 538
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.065607939970255e-09,
|
|
"advantages/std": 0.5726834535598755,
|
|
"advantages/var": 0.32796633798126607,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.5476702508960574,
|
|
"grad_norm": 0.1008218760985725,
|
|
"learning_rate": 1.822294899688809e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 46068728.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 539
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878579738631987e-09,
|
|
"advantages/std": 0.5727010369300842,
|
|
"advantages/var": 0.3279864777007937,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.5505376344086022,
|
|
"grad_norm": 0.11497320202059863,
|
|
"learning_rate": 1.8216540641818399e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 46162825.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.17806705832481384,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 540
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.3550341577173055e-08,
|
|
"advantages/std": 0.6185750365257263,
|
|
"advantages/var": 0.38263507581280365,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.553405017921147,
|
|
"grad_norm": 0.1890697733491184,
|
|
"learning_rate": 1.821012188376593e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 46255325.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.1922685205936432,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 541
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.5562724014336917,
|
|
"grad_norm": 0.09357868073770031,
|
|
"learning_rate": 1.8203692730857492e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 46332842.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 542
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907505770133387e-10,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 1.5591397849462365,
|
|
"grad_norm": 0.10648952831676473,
|
|
"learning_rate": 1.819725319123305e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 46418299.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 543
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.5620071684587815,
|
|
"grad_norm": 0.13346921631545694,
|
|
"learning_rate": 1.8190803273045723e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 46495675.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 544
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.564874551971326,
|
|
"grad_norm": 0.13632501005664457,
|
|
"learning_rate": 1.8184342984461764e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 46580431.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 545
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983450684521008e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.567741935483871,
|
|
"grad_norm": 0.16067957718336778,
|
|
"learning_rate": 1.8177872333660569e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 46665618.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 546
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.4082993428404723e-08,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.5706093189964156,
|
|
"grad_norm": 0.08184278908307872,
|
|
"learning_rate": 1.8171391328834638e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 46744346.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 547
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.5734767025089607,
|
|
"grad_norm": 0.21545240402425084,
|
|
"learning_rate": 1.8164899978189592e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 46836186.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 548
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899641578136434e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 1.5763440860215052,
|
|
"grad_norm": 0.08796547171831597,
|
|
"learning_rate": 1.8158398289944142e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 46926622.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 549
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.5792114695340502,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.8151886272330094e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47016908.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 550
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.582078853046595,
|
|
"grad_norm": 0.0932480653471435,
|
|
"learning_rate": 1.8145363933592334e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47112852.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 551
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.065538545096158e-09,
|
|
"advantages/std": 0.5726932287216187,
|
|
"advantages/var": 0.3279775342235922,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.5849462365591398,
|
|
"grad_norm": 0.09557245277745192,
|
|
"learning_rate": 1.8138831281988805e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47204638.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.172288179397583,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 552
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 1.5878136200716846,
|
|
"grad_norm": 0.14325604527216604,
|
|
"learning_rate": 1.8132288325790515e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47286370.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 553
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.9752099207640785e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.5906810035842294,
|
|
"grad_norm": 0.1400010191680866,
|
|
"learning_rate": 1.8125735073281522e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47371538.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 554
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.25248601345888e-09,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.5935483870967742,
|
|
"grad_norm": 0.13880001238703485,
|
|
"learning_rate": 1.8119171532758907e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 47461498.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 555
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.596415770609319,
|
|
"grad_norm": 0.06593143826340761,
|
|
"learning_rate": 1.8112597712532796e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47527028.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 556
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 4.929320121387901e-09,
|
|
"advantages/std": 0.6612735390663147,
|
|
"advantages/var": 0.43728269346928883,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.599283154121864,
|
|
"grad_norm": 0.24531721008997973,
|
|
"learning_rate": 1.810601362092631e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47606091.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.20858673751354218,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 557
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0689055673126508e-08,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.6021505376344085,
|
|
"grad_norm": 0.14318215339362345,
|
|
"learning_rate": 1.809941926627559e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47678580.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 558
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.528039356157216e-09,
|
|
"advantages/std": 0.6185691356658936,
|
|
"advantages/var": 0.3826277755984506,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.6050179211469535,
|
|
"grad_norm": 0.09542996872827302,
|
|
"learning_rate": 1.8092814656929758e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47763659.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.18543371558189392,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 559
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125678014490734e-09,
|
|
"advantages/std": 0.5227980017662048,
|
|
"advantages/var": 0.2733177506507367,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.607885304659498,
|
|
"grad_norm": 0.10388799322074858,
|
|
"learning_rate": 1.8086199801250934e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47851559.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.14571532607078552,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 560
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.610752688172043,
|
|
"grad_norm": 0.19137629780953486,
|
|
"learning_rate": 1.80795747076142e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 47927931.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 561
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 6.77521322356945e-09,
|
|
"advantages/std": 0.6185711622238159,
|
|
"advantages/var": 0.3826302827349224,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.6136200716845877,
|
|
"grad_norm": 0.16267964778963664,
|
|
"learning_rate": 1.8072939384407607e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 48011635.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.18884865939617157,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 562
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.01121313104812e-09,
|
|
"advantages/std": 0.6185696721076965,
|
|
"advantages/var": 0.3826284392514232,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.6164874551971327,
|
|
"grad_norm": 0.14616920487924576,
|
|
"learning_rate": 1.8066293840032146e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 48097916.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.18649455904960632,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 563
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.131336901697577e-10,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.6193548387096774,
|
|
"grad_norm": 0.11525279452798919,
|
|
"learning_rate": 1.8059638082901765e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48193148.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 564
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.6222222222222222,
|
|
"grad_norm": 0.07025099475920485,
|
|
"learning_rate": 1.8052972121443335e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48266406.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 565
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.3550515235317965e-08,
|
|
"advantages/std": 0.6185671091079712,
|
|
"advantages/var": 0.38262526847019274,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.625089605734767,
|
|
"grad_norm": 0.14157810037874768,
|
|
"learning_rate": 1.8046295964096641e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48352229.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.18201877176761627,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 566
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 1.6279569892473118,
|
|
"grad_norm": 0.1390230216267632,
|
|
"learning_rate": 1.8039609619314389e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48443657.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.17859894037246704,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 567
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444321679928155e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.6308243727598566,
|
|
"grad_norm": 0.1369742479768933,
|
|
"learning_rate": 1.8032913095562172e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48532110.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 568
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 2.816760194592258e-09,
|
|
"advantages/std": 0.6612721681594849,
|
|
"advantages/var": 0.437280880382346,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.6336917562724014,
|
|
"grad_norm": 0.16966932036270904,
|
|
"learning_rate": 1.802620640131848e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 48620955.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.20623260736465454,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 569
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.269610284998461e-09,
|
|
"advantages/std": 0.6185711622238159,
|
|
"advantages/var": 0.3826302827349224,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.6365591397849464,
|
|
"grad_norm": 0.1201180445347826,
|
|
"learning_rate": 1.8019489545074672e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48712311.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.18884865939617157,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 570
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131246346616979e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.639426523297491,
|
|
"grad_norm": 0.15013529882265428,
|
|
"learning_rate": 1.8012762535334975e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48811971.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.15650182962417603,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 571
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125942055767658e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.642293906810036,
|
|
"grad_norm": 0.11477054989531618,
|
|
"learning_rate": 1.8006025380616478e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48896368.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 572
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.3941923669503344e-08,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.6451612903225805,
|
|
"grad_norm": 0.11789349960518178,
|
|
"learning_rate": 1.7999278089449108e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 48983655.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 573
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675973355770111,
|
|
"advantages/var": 0.21864726823871994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.6480286738351255,
|
|
"grad_norm": 0.0976815205011024,
|
|
"learning_rate": 1.7992520670375625e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 49071387.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11100948601961136,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 574
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516894993554525e-09,
|
|
"advantages/std": 0.6185593605041504,
|
|
"advantages/var": 0.3826156824673035,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.65089605734767,
|
|
"grad_norm": 0.17323239068739085,
|
|
"learning_rate": 1.7985753131951614e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 49147131.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.17517907917499542,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 575
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.60983260462163e-08,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.653763440860215,
|
|
"grad_norm": 0.0841651778614561,
|
|
"learning_rate": 1.7978975482745477e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 49227436.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 576
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.252522235212141e-09,
|
|
"advantages/std": 0.5726771354675293,
|
|
"advantages/var": 0.3279591014872949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.6566308243727599,
|
|
"grad_norm": 0.1548230898091453,
|
|
"learning_rate": 1.7972187731338409e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 49307675.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.1530819982290268,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 577
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.970961834751672e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.6594982078853047,
|
|
"grad_norm": 0.11624446921524587,
|
|
"learning_rate": 1.7965389886324397e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 49392352.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 578
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.6623655913978495,
|
|
"grad_norm": 0.05701063211512371,
|
|
"learning_rate": 1.7958581956310214e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 49475549.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 579
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.65625,
|
|
"epoch": 1.6652329749103942,
|
|
"grad_norm": 0.09897261256162439,
|
|
"learning_rate": 1.7951763949915398e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 49566450.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 580
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.267469408556425e-08,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.668100358422939,
|
|
"grad_norm": 0.056827008086541814,
|
|
"learning_rate": 1.7944935875772242e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 49644577.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 581
|
|
},
|
|
{
|
|
"advantages/mean": 7.450580596923828e-09,
|
|
"advantages/snr": 1.062247430213908e-08,
|
|
"advantages/std": 0.7013978362083435,
|
|
"advantages/var": 0.49195892463774626,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 1.6709677419354838,
|
|
"grad_norm": 0.21157890645747224,
|
|
"learning_rate": 1.7938097742525788e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 49741380.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.25118327140808105,
|
|
"rewards/drgrpo_math_reward/mean": 0.53125,
|
|
"rewards/drgrpo_math_reward/std": 0.5009832978248596,
|
|
"step": 582
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.6738351254480288,
|
|
"grad_norm": 0.113043689959348,
|
|
"learning_rate": 1.7931249558833813e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 49824375.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.13098980486392975,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 583
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.6767025089605734,
|
|
"grad_norm": 0.08622293290280962,
|
|
"learning_rate": 1.792439133336682e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 49894013.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 584
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721649001610904e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.6795698924731184,
|
|
"grad_norm": 0.09241404849302579,
|
|
"learning_rate": 1.7917523074808022e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 49980856.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 585
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.682437275985663,
|
|
"grad_norm": 0.08753320201736134,
|
|
"learning_rate": 1.7910644791853345e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 50052352.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 586
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.50555013211365e-09,
|
|
"advantages/std": 0.6185928583145142,
|
|
"advantages/var": 0.3826571243577206,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.685304659498208,
|
|
"grad_norm": 0.12004129638171342,
|
|
"learning_rate": 1.79037564932114e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 50148699.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.2161829173564911,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 587
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.6881720430107527,
|
|
"grad_norm": 0.07962124500314915,
|
|
"learning_rate": 1.7896858187603474e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 50214726.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 588
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379887186086637e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.6910394265232975,
|
|
"grad_norm": 0.0806365501054856,
|
|
"learning_rate": 1.788994988376353e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 50299840.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 589
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.6939068100358423,
|
|
"grad_norm": 0.07540264432819178,
|
|
"learning_rate": 1.7883031590438194e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 50383747.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 590
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.5056078712314432e-09,
|
|
"advantages/std": 0.6185691356658936,
|
|
"advantages/var": 0.3826277755984506,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.696774193548387,
|
|
"grad_norm": 0.1138498837367993,
|
|
"learning_rate": 1.7876103316386727e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 50469109.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.18543371558189392,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 591
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 8.450146530707848e-09,
|
|
"advantages/std": 0.6612826585769653,
|
|
"advantages/var": 0.4372947545346193,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.6996415770609319,
|
|
"grad_norm": 0.13635243870664232,
|
|
"learning_rate": 1.7869165070381043e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 50552794.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.21884137392044067,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 592
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907227504745508e-10,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.7025089605734767,
|
|
"grad_norm": 0.10518691727099587,
|
|
"learning_rate": 1.7862216861205667e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 50627662.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 593
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299802498719973e-09,
|
|
"advantages/std": 0.4049576222896576,
|
|
"advantages/var": 0.16399067585049298,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.7053763440860215,
|
|
"grad_norm": 0.08787262259127059,
|
|
"learning_rate": 1.7855258697657746e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 50712403.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08891239762306213,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 594
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.7082437275985662,
|
|
"grad_norm": 0.06658110423735736,
|
|
"learning_rate": 1.7848290588547026e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 50800496.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 595
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 0.036404286414256665,
|
|
"learning_rate": 1.7841312542695848e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 50882390.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 596
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 1.0954489382432772e-08,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.7139784946236558,
|
|
"grad_norm": 0.09944774588564752,
|
|
"learning_rate": 1.7834324568939136e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 50957746.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 597
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 1.7168458781362008,
|
|
"grad_norm": 0.04898368888650664,
|
|
"learning_rate": 1.782732667612438e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51036336.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 598
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979230209351863e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.7197132616487454,
|
|
"grad_norm": 0.07891434988260301,
|
|
"learning_rate": 1.7820318873111626e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51115891.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 599
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.24699507424554e-08,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.7225806451612904,
|
|
"grad_norm": 0.10373118203585466,
|
|
"learning_rate": 1.7813301168773478e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51195517.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 600
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983629174425397e-09,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.7254480286738352,
|
|
"grad_norm": 0.08823420239986755,
|
|
"learning_rate": 1.7806273571995065e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51274101.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 601
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-09,
|
|
"advantages/snr": 1.6262067861863043e-08,
|
|
"advantages/std": 0.5726962685585022,
|
|
"advantages/var": 0.3279810160208321,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.72831541218638,
|
|
"grad_norm": 0.11082640603899152,
|
|
"learning_rate": 1.7799236091674045e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51361658.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.17358636856079102,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 602
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 1.7311827956989247,
|
|
"grad_norm": 0.028673913004331607,
|
|
"learning_rate": 1.779218873672059e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51443992.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 603
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049402934764073e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.7340501792114695,
|
|
"grad_norm": 0.08853857547337998,
|
|
"learning_rate": 1.7785131516057374e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51514409.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 604
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.7369175627240143,
|
|
"grad_norm": 0.08212926446077826,
|
|
"learning_rate": 1.7778064438619559e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51606124.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.11678344756364822,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 605
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.739784946236559,
|
|
"grad_norm": 0.08105299644208669,
|
|
"learning_rate": 1.7770987513354796e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 51690944.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 606
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.065668450848788e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.742652329749104,
|
|
"grad_norm": 0.09925762413655309,
|
|
"learning_rate": 1.7763900749223194e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 51786435.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.14966703951358795,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 607
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.13111686481873e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.7455197132616487,
|
|
"grad_norm": 0.1563651934669506,
|
|
"learning_rate": 1.7756804155197322e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 51865124.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 608
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 7.966936970044213e-09,
|
|
"advantages/std": 0.7013906836509705,
|
|
"advantages/var": 0.4919488911123757,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 1.7483870967741937,
|
|
"grad_norm": 0.12125368324296161,
|
|
"learning_rate": 1.7749697740262195e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 51958950.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.23752352595329285,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 609
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.96693437436781e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.7512544802867382,
|
|
"grad_norm": 0.08696671817760408,
|
|
"learning_rate": 1.7742581513415265e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 52035790.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 610
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.011171929325739e-09,
|
|
"advantages/std": 0.6185781359672546,
|
|
"advantages/var": 0.38263891029672337,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.7541218637992833,
|
|
"grad_norm": 0.14584663912796295,
|
|
"learning_rate": 1.7735455483666404e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 52129039.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.19780512154102325,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 611
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.264868475059852e-08,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.7569892473118278,
|
|
"grad_norm": 0.08213421867548132,
|
|
"learning_rate": 1.7728319660037897e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 52202394.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 612
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.7598566308243728,
|
|
"grad_norm": 0.11010447507234923,
|
|
"learning_rate": 1.7721174051564426e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 52280139.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 613
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 7.31820321152782e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.7627240143369176,
|
|
"grad_norm": 0.0884489361838935,
|
|
"learning_rate": 1.771401866729307e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 52362540.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 614
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.2675148962721033e-08,
|
|
"advantages/std": 0.6612863540649414,
|
|
"advantages/var": 0.43729964207250305,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.7655913978494624,
|
|
"grad_norm": 0.1390975581270697,
|
|
"learning_rate": 1.7706853516283269e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 52450800.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.2222612351179123,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 615
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.294576256939553e-08,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.7684587813620072,
|
|
"grad_norm": 0.08340422526536204,
|
|
"learning_rate": 1.7699678607606848e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 52539594.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.1236182302236557,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 616
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958206547585018e-10,
|
|
"advantages/std": 0.46761560440063477,
|
|
"advantages/var": 0.21866435347897095,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.771326164874552,
|
|
"grad_norm": 0.08804980976750112,
|
|
"learning_rate": 1.7692493950347968e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 52619699.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12809400260448456,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 617
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.970961834751672e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.7741935483870968,
|
|
"grad_norm": 0.09704645442338684,
|
|
"learning_rate": 1.768529955360315e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 52693760.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 618
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125814501076877e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 1.7770609318996415,
|
|
"grad_norm": 0.09728208620780422,
|
|
"learning_rate": 1.7678095426481235e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 52785410.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 619
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9916745663410504e-09,
|
|
"advantages/std": 0.4676077961921692,
|
|
"advantages/var": 0.21865705105969724,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.7799283154121865,
|
|
"grad_norm": 0.0590535191981089,
|
|
"learning_rate": 1.7670881578103383e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 52870139.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.12125921994447708,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 620
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.626265687704098e-09,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.782795698924731,
|
|
"grad_norm": 0.11417855950003987,
|
|
"learning_rate": 1.7663658017603072e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 52966074.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.15072788298130035,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 621
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344336502847305e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 1.7856630824372761,
|
|
"grad_norm": 0.10528707358064278,
|
|
"learning_rate": 1.7656424754126064e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 53053990.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13781969249248505,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 622
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125731633597449e-09,
|
|
"advantages/std": 0.5227940678596497,
|
|
"advantages/var": 0.27331363738923997,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.7885304659498207,
|
|
"grad_norm": 0.16258954346364168,
|
|
"learning_rate": 1.7649181796830415e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 53144189.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.1433562934398651,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 623
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.2583643671009597e-09,
|
|
"advantages/std": 0.6185821294784546,
|
|
"advantages/var": 0.38264385091009956,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.7913978494623657,
|
|
"grad_norm": 0.1303926158518905,
|
|
"learning_rate": 1.7641929154886453e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 53228553.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.20463499426841736,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 624
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958429318795977e-10,
|
|
"advantages/std": 0.4676051437854767,
|
|
"advantages/var": 0.21865457049463632,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.7942652329749103,
|
|
"grad_norm": 0.1150785865190137,
|
|
"learning_rate": 1.7634666837476763e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 53314514.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.11784426867961884,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 625
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.7971326164874553,
|
|
"grad_norm": 0.08192531188502497,
|
|
"learning_rate": 1.7627394853796184e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 53394009.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 626
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.0111745406690435e-09,
|
|
"advantages/std": 0.6185775995254517,
|
|
"advantages/var": 0.38263824663467005,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 1.8,
|
|
"grad_norm": 0.1163617284955123,
|
|
"learning_rate": 1.7620113213051795e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 53494339.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.19674427807331085,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 627
|
|
},
|
|
{
|
|
"advantages/mean": 6.05359673500061e-09,
|
|
"advantages/snr": 9.786294625565132e-09,
|
|
"advantages/std": 0.6185790300369263,
|
|
"advantages/var": 0.38264001640142453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.8028673835125448,
|
|
"grad_norm": 0.12523098720127152,
|
|
"learning_rate": 1.76128219244629e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 53580382.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1990984082221985,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 628
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3443230978138685e-09,
|
|
"advantages/std": 0.5227916836738586,
|
|
"advantages/var": 0.2733111445185479,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.8057347670250896,
|
|
"grad_norm": 0.10023370396243167,
|
|
"learning_rate": 1.7605520997261011e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 53668316.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.13994136452674866,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 629
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.974957819673592e-09,
|
|
"advantages/std": 0.46761295199394226,
|
|
"advantages/var": 0.21866187287248895,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.8086021505376344,
|
|
"grad_norm": 0.2227891417965325,
|
|
"learning_rate": 1.7598210440689857e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 53738437.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1246790662407875,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 630
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.126037115417672e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.8114695340501792,
|
|
"grad_norm": 0.17391472702432173,
|
|
"learning_rate": 1.7590890264005354e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 53829447.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.1173202320933342,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 631
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262736431211962e-09,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.814336917562724,
|
|
"grad_norm": 0.172820094067218,
|
|
"learning_rate": 1.7583560476475587e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 53901473.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1462521106004715,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 632
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.817204301075269,
|
|
"grad_norm": 0.050725545773023864,
|
|
"learning_rate": 1.757622108738083e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 53966130.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 633
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.8200716845878135,
|
|
"grad_norm": 0.09921205484552398,
|
|
"learning_rate": 1.756887210601349e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 54045729.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 634
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814193005215579e-09,
|
|
"advantages/std": 0.5227980613708496,
|
|
"advantages/var": 0.27331781297311863,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.8229390681003586,
|
|
"grad_norm": 0.11264600133960613,
|
|
"learning_rate": 1.7561513541678141e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 54130690.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.14571532607078552,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 635
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 8.131160024959952e-09,
|
|
"advantages/std": 0.5726873874664307,
|
|
"advantages/var": 0.3279708437631257,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.8258064516129031,
|
|
"grad_norm": 0.12736837348410912,
|
|
"learning_rate": 1.7554145403691473e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 54217523.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.16651421785354614,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 636
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.247008722823236e-08,
|
|
"advantages/std": 0.5227916836738586,
|
|
"advantages/var": 0.2733111445185479,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.8286738351254481,
|
|
"grad_norm": 0.12924698284111052,
|
|
"learning_rate": 1.7546767701382308e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 54301592.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.13994136452674866,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 637
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344354173221399e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 1.8315412186379927,
|
|
"grad_norm": 0.09778556658427673,
|
|
"learning_rate": 1.7539380444091571e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 54391954.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 638
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5629027821475993e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.8344086021505377,
|
|
"grad_norm": 0.11683676138661579,
|
|
"learning_rate": 1.753198364117229e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 54481350.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 639
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.8372759856630825,
|
|
"grad_norm": 0.07753623795481836,
|
|
"learning_rate": 1.7524577301989572e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 54552292.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 640
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0689006924160064e-08,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.8401433691756273,
|
|
"grad_norm": 0.11752012585465188,
|
|
"learning_rate": 1.7517161435920605e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 54632333.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 641
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016541313711486e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.843010752688172,
|
|
"grad_norm": 0.09296534465920685,
|
|
"learning_rate": 1.7509736052354632e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 54715302.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 642
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.8458781362007168,
|
|
"grad_norm": 0.11737325425319424,
|
|
"learning_rate": 1.750230116069295e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 54796941.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 643
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.439391676409106e-09,
|
|
"advantages/std": 0.5726771354675293,
|
|
"advantages/var": 0.3279591014872949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.8487455197132616,
|
|
"grad_norm": 0.10768871696196604,
|
|
"learning_rate": 1.7494856770348903e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 54877517.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1530819982290268,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 644
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444254652277355e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.8516129032258064,
|
|
"grad_norm": 0.13384002189732117,
|
|
"learning_rate": 1.748740289074784e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 54947818.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 645
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 6.639245037827722e-10,
|
|
"advantages/std": 0.7013768553733826,
|
|
"advantages/var": 0.4919294932534548,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.8544802867383514,
|
|
"grad_norm": 0.18004331924682504,
|
|
"learning_rate": 1.7479939531327144e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55039411.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.22043409943580627,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 646
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 9.797937967257332e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.857347670250896,
|
|
"grad_norm": 0.09604307611587987,
|
|
"learning_rate": 1.747246670153619e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55129354.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 647
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.860215053763441,
|
|
"grad_norm": 0.08548359825937912,
|
|
"learning_rate": 1.746498441083635e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55198994.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 648
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 6.77520147229019e-09,
|
|
"advantages/std": 0.6185722351074219,
|
|
"advantages/var": 0.3826316100457916,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.8630824372759855,
|
|
"grad_norm": 0.15433130304994333,
|
|
"learning_rate": 1.7457492668700967e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 55276732.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.19097033143043518,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 649
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 1.0954289704678737e-08,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.8659498207885306,
|
|
"grad_norm": 0.17622307002850085,
|
|
"learning_rate": 1.7449991484615359e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 55356997.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 650
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878766595896755e-09,
|
|
"advantages/std": 0.5726791024208069,
|
|
"advantages/var": 0.327961354349501,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 1.8688172043010751,
|
|
"grad_norm": 0.10472142185864963,
|
|
"learning_rate": 1.7442480868076789e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55446748.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.15308690071105957,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 651
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.8716845878136201,
|
|
"grad_norm": 0.07288459873608755,
|
|
"learning_rate": 1.743496082859447e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 55525801.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 652
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.874551971326165,
|
|
"grad_norm": 0.08759904484132258,
|
|
"learning_rate": 1.7427431375689543e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55598059.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 653
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.8774193548387097,
|
|
"grad_norm": 0.07013332630729088,
|
|
"learning_rate": 1.7419892518895067e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55682674.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 654
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.8802867383512545,
|
|
"grad_norm": 0.09070016569887124,
|
|
"learning_rate": 1.7412344267756009e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55766402.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 655
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966756148857264e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.8831541218637993,
|
|
"grad_norm": 0.12727931385893082,
|
|
"learning_rate": 1.7404786631829226e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55843660.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 656
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.8788021410185465e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.886021505376344,
|
|
"grad_norm": 0.10677855855478481,
|
|
"learning_rate": 1.7397219620683463e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 55919559.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 657
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.8888888888888888,
|
|
"grad_norm": 0.16188825238468785,
|
|
"learning_rate": 1.738964324389933e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 55991862.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 658
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.8917562724014338,
|
|
"grad_norm": 0.08240144468938987,
|
|
"learning_rate": 1.7382057511069296e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56059473.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 659
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721323019547286e-09,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.8946236559139784,
|
|
"grad_norm": 0.10283567421926562,
|
|
"learning_rate": 1.737446243179768e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56131773.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 660
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.8974910394265234,
|
|
"grad_norm": 0.07697940771219978,
|
|
"learning_rate": 1.7366858015700624e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56216211.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 661
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 3.983471193166227e-09,
|
|
"advantages/std": 0.7013902068138123,
|
|
"advantages/var": 0.49194822221432233,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.900358422939068,
|
|
"grad_norm": 0.1300291889631906,
|
|
"learning_rate": 1.7359244272406107e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56321214.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.23646268248558044,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 662
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.903225806451613,
|
|
"grad_norm": 0.05520265524593956,
|
|
"learning_rate": 1.73516212115539e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 56401868.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 663
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.672108843924502e-09,
|
|
"advantages/std": 0.5228019952774048,
|
|
"advantages/var": 0.2733219262660356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.9060931899641576,
|
|
"grad_norm": 0.13582013022578326,
|
|
"learning_rate": 1.7343988842795584e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56493224.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.14807432889938354,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 664
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.970744583309513e-09,
|
|
"advantages/std": 0.46761560440063477,
|
|
"advantages/var": 0.21866435347897095,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.9089605734767026,
|
|
"grad_norm": 0.13227371644183966,
|
|
"learning_rate": 1.7336347175794521e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56565927.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.12809400260448456,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 665
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.0539355639134917e-08,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.9118279569892473,
|
|
"grad_norm": 0.12795985341564062,
|
|
"learning_rate": 1.7328696220225845e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56652668.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.17859894037246704,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 666
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.8993206715259576e-09,
|
|
"advantages/std": 0.4049627184867859,
|
|
"advantages/var": 0.1639948033642078,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 1.9146953405017921,
|
|
"grad_norm": 0.07328231464595761,
|
|
"learning_rate": 1.732103598577645e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56749515.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 667
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.917562724014337,
|
|
"grad_norm": 0.12914763253719083,
|
|
"learning_rate": 1.7313366482144973e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 56833761.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 668
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.9204301075268817,
|
|
"grad_norm": 0.07890408596078594,
|
|
"learning_rate": 1.7305687719041798e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56919886.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.08443662524223328,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 669
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.9232974910394265,
|
|
"grad_norm": 0.09684022294367929,
|
|
"learning_rate": 1.7297999706189025e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 56989434.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 670
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.528014692854944e-09,
|
|
"advantages/std": 0.6185711622238159,
|
|
"advantages/var": 0.3826302827349224,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.9261648745519713,
|
|
"grad_norm": 0.21154492643552217,
|
|
"learning_rate": 1.7290302453320465e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57073367.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.18884867429733276,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 671
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 7.528079978419127e-10,
|
|
"advantages/std": 0.6185657978057861,
|
|
"advantages/var": 0.3826236462151087,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.9290322580645163,
|
|
"grad_norm": 0.14458294656914567,
|
|
"learning_rate": 1.7282595970181628e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57148824.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1830746978521347,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 672
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966756148857264e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.9318996415770608,
|
|
"grad_norm": 0.1066687707439637,
|
|
"learning_rate": 1.7274880266529715e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57235947.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 673
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.9347670250896059,
|
|
"grad_norm": 0.08970681835046851,
|
|
"learning_rate": 1.7267155352133598e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57314343.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 674
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.9376344086021504,
|
|
"grad_norm": 0.037950397904259026,
|
|
"learning_rate": 1.7259421236773806e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 57398252.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 675
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954734451444e-08,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 1.9405017921146954,
|
|
"grad_norm": 0.07044606949811252,
|
|
"learning_rate": 1.7251677930242524e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57472101.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 676
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.1312150336472e-10,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.94336917562724,
|
|
"grad_norm": 0.13746445766784376,
|
|
"learning_rate": 1.7243925442343575e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57574344.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 677
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.344384639658041e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 1.946236559139785,
|
|
"grad_norm": 0.11281681887095754,
|
|
"learning_rate": 1.7236163782892402e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57650886.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 678
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 7.52801541824388e-09,
|
|
"advantages/std": 0.6185711026191711,
|
|
"advantages/var": 0.38263020899549716,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 1.9491039426523298,
|
|
"grad_norm": 0.11326927243511467,
|
|
"learning_rate": 1.7228392961716058e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 57739335.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.18884865939617157,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 679
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5228016972541809,
|
|
"advantages/var": 0.27332161465185223,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.9519713261648746,
|
|
"grad_norm": 0.1003676774596821,
|
|
"learning_rate": 1.72206129886532e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57820745.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.15125194191932678,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 680
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125868122237681e-09,
|
|
"advantages/std": 0.5227840542793274,
|
|
"advantages/var": 0.27330316740873073,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.9548387096774194,
|
|
"grad_norm": 0.11503852642526637,
|
|
"learning_rate": 1.7212823873554077e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57900871.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.13204574584960938,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 681
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.9577060931899641,
|
|
"grad_norm": 0.050713752984807875,
|
|
"learning_rate": 1.72050256262805e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 57992278.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 682
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.960573476702509,
|
|
"grad_norm": 0.04396861649262558,
|
|
"learning_rate": 1.7197218256705857e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 58073213.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 683
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 3.764055585405327e-09,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 1.9634408602150537,
|
|
"grad_norm": 0.11596307369386666,
|
|
"learning_rate": 1.7189401774715072e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 58166340.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.17859892547130585,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 684
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.9663082437275987,
|
|
"grad_norm": 0.06345341305669575,
|
|
"learning_rate": 1.7181576190204616e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 58246745.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 685
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.453574654603735e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.9691756272401433,
|
|
"grad_norm": 0.15361994081883612,
|
|
"learning_rate": 1.7173741513082478e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 58326136.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 686
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 1.9720430107526883,
|
|
"grad_norm": 0.05207869475321715,
|
|
"learning_rate": 1.7165897753268165e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 58407208.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 687
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.1266652673742488e-08,
|
|
"advantages/std": 0.33064746856689453,
|
|
"advantages/var": 0.10932774846969551,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.9749103942652328,
|
|
"grad_norm": 0.11293308439040252,
|
|
"learning_rate": 1.7158044920692682e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 58490331.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 688
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 1.9777777777777779,
|
|
"grad_norm": 0.08567692143451083,
|
|
"learning_rate": 1.7150183025298517e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 58568099.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 689
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5056078712314432e-09,
|
|
"advantages/std": 0.6185691356658936,
|
|
"advantages/var": 0.3826277755984506,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.9806451612903224,
|
|
"grad_norm": 0.15087248324596125,
|
|
"learning_rate": 1.7142312077039638e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 58644861.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.18543371558189392,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 690
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.198954094593892e-09,
|
|
"advantages/std": 0.40496888756752014,
|
|
"advantages/var": 0.16399979989767477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 1.9835125448028674,
|
|
"grad_norm": 0.08803479172788438,
|
|
"learning_rate": 1.7134432085881469e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 58728309.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.09810129553079605,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 691
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.0348901800453944e-08,
|
|
"advantages/std": 0.4049658179283142,
|
|
"advantages/var": 0.16399731369034853,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 1.9863799283154122,
|
|
"grad_norm": 0.09670088622860404,
|
|
"learning_rate": 1.7126543061800893e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 58805589.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.0946863517165184,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 692
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954734451444e-08,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 1.989247311827957,
|
|
"grad_norm": 0.09839509805314722,
|
|
"learning_rate": 1.7118645014786216e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 58874408.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 693
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 1.9921146953405018,
|
|
"grad_norm": 0.0651367992382115,
|
|
"learning_rate": 1.7110737954837182e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 58947583.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 694
|
|
},
|
|
{
|
|
"advantages/mean": -2.0954757928848267e-09,
|
|
"advantages/snr": 3.168766962023389e-09,
|
|
"advantages/std": 0.6612905859947205,
|
|
"advantages/var": 0.4373052391252408,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 1.9949820788530466,
|
|
"grad_norm": 0.13001653169016658,
|
|
"learning_rate": 1.7102821891964933e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59036599.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.2301519364118576,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 695
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 1.9978494623655914,
|
|
"grad_norm": 0.13689238215628702,
|
|
"learning_rate": 1.7094896836192021e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59112092.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 696
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 5.0387450601867105e-09,
|
|
"advantages/std": 0.7393289804458618,
|
|
"advantages/var": 0.5466073413271175,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 2.002867383512545,
|
|
"grad_norm": 0.14738650917438545,
|
|
"learning_rate": 1.7086962797552372e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59210297.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.2619746923446655,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 697
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.1579757501173404e-08,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.0057347670250896,
|
|
"grad_norm": 0.10589172183649662,
|
|
"learning_rate": 1.7079019786091304e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59274800.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 698
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599659158374989e-09,
|
|
"advantages/std": 0.40495285391807556,
|
|
"advantages/var": 0.16398681389639425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.0086021505376346,
|
|
"grad_norm": 0.10012153562307823,
|
|
"learning_rate": 1.7071067811865474e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59351944.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 699
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7815330444009752e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.011469534050179,
|
|
"grad_norm": 0.07468079317511114,
|
|
"learning_rate": 1.7063106884942902e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59431785.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 700
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599795920412534e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 2.014336917562724,
|
|
"grad_norm": 0.06906182952801246,
|
|
"learning_rate": 1.7055137015402932e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 59517257.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 701
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.0172043010752687,
|
|
"grad_norm": 0.08192055329133872,
|
|
"learning_rate": 1.7047158213336241e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59593235.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 702
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.528008164360805e-09,
|
|
"advantages/std": 0.6185716986656189,
|
|
"advantages/var": 0.3826309463900692,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.0200716845878137,
|
|
"grad_norm": 0.12550527758126853,
|
|
"learning_rate": 1.703917048884481e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59676199.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.18990948796272278,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 703
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3442097670369564e-09,
|
|
"advantages/std": 0.5228027701377869,
|
|
"advantages/var": 0.2733227364637436,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.0229390681003583,
|
|
"grad_norm": 0.13322333703758604,
|
|
"learning_rate": 1.7031173852041914e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 59762217.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.15254521369934082,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 704
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.0258064516129033,
|
|
"grad_norm": 0.04565102881553923,
|
|
"learning_rate": 1.7023168313052115e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 59857822.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 705
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.028673835125448,
|
|
"grad_norm": 0.09427426790565954,
|
|
"learning_rate": 1.701515388201125e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 59933853.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 706
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.031541218637993,
|
|
"grad_norm": 0.15518696240852958,
|
|
"learning_rate": 1.700713056906641e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 60010800.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 707
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299773557175319e-09,
|
|
"advantages/std": 0.4049627184867859,
|
|
"advantages/var": 0.1639948033642078,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.0344086021505374,
|
|
"grad_norm": 0.06849252314183808,
|
|
"learning_rate": 1.6999098384375928e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 60094064.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 708
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.878651331707719e-09,
|
|
"advantages/std": 0.5726926326751709,
|
|
"advantages/var": 0.3279768515204182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.0372759856630824,
|
|
"grad_norm": 0.1139504153736027,
|
|
"learning_rate": 1.6991057338109374e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 60182913.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1712273508310318,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 709
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.504900262044325e-09,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.0401433691756274,
|
|
"grad_norm": 0.11041081468071696,
|
|
"learning_rate": 1.6983007440447539e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 60277855.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.16675157845020294,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 710
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917699002625455e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.043010752688172,
|
|
"grad_norm": 0.11390562174606526,
|
|
"learning_rate": 1.6974948701582417e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 60374112.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 711
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814432667740602e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.045878136200717,
|
|
"grad_norm": 0.09113302680893781,
|
|
"learning_rate": 1.6966881131717196e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 60458057.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 712
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599795920412534e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.0487455197132616,
|
|
"grad_norm": 0.09052835603090768,
|
|
"learning_rate": 1.6958804741066252e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 60540298.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 713
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.0516129032258066,
|
|
"grad_norm": 0.11208448837819521,
|
|
"learning_rate": 1.695071953985512e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 60616944.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 714
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 1.09543064605741e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.054480286738351,
|
|
"grad_norm": 0.11787645732740036,
|
|
"learning_rate": 1.6942625538320492e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 60709383.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 715
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983539800525091e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.057347670250896,
|
|
"grad_norm": 0.08071633968502825,
|
|
"learning_rate": 1.6934522746710204e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 60797237.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 716
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 8.131003465846817e-09,
|
|
"advantages/std": 0.5726984143257141,
|
|
"advantages/var": 0.3279834737711873,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.0602150537634407,
|
|
"grad_norm": 0.15739203678265482,
|
|
"learning_rate": 1.6926411175283227e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 60890581.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.17700129747390747,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 717
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 7.3180942919464596e-09,
|
|
"advantages/std": 0.5726834535598755,
|
|
"advantages/var": 0.32796633798126607,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.0630824372759857,
|
|
"grad_norm": 0.14517907380286715,
|
|
"learning_rate": 1.6918290834309631e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 60983682.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 718
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299811976811062e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.0659498207885303,
|
|
"grad_norm": 0.10471779058716572,
|
|
"learning_rate": 1.691016173407061e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 61065503.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 719
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.0688172043010753,
|
|
"grad_norm": 0.07649994559337656,
|
|
"learning_rate": 1.6902023884858436e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 61149911.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 720
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.87871074032323e-09,
|
|
"advantages/std": 0.5726856589317322,
|
|
"advantages/var": 0.3279688639460723,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 2.07168458781362,
|
|
"grad_norm": 0.1380323189643192,
|
|
"learning_rate": 1.6893877296976457e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 61233905.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.16333171725273132,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 721
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344354173221399e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.074551971326165,
|
|
"grad_norm": 0.11343862959636024,
|
|
"learning_rate": 1.6885721980739089e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 61318503.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 722
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.7495735248314805e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.07741935483871,
|
|
"grad_norm": 0.08660213837979328,
|
|
"learning_rate": 1.6877557946471805e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 61399464.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 723
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444321679928155e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.0802867383512544,
|
|
"grad_norm": 0.12595960129785244,
|
|
"learning_rate": 1.68693852045111e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 61475702.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 724
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.0831541218637994,
|
|
"grad_norm": 0.14328105313741246,
|
|
"learning_rate": 1.6861203765204508e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 61546127.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 725
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.086021505376344,
|
|
"grad_norm": 0.0811799210414726,
|
|
"learning_rate": 1.685301363891057e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 61636503.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 726
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344227436572988e-09,
|
|
"advantages/std": 0.5228010416030884,
|
|
"advantages/var": 0.27332092910127415,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 2.088888888888889,
|
|
"grad_norm": 0.10775212757040224,
|
|
"learning_rate": 1.6844814835998825e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 61730092.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.15019109845161438,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 727
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9916557799070046e-09,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.0917562724014336,
|
|
"grad_norm": 0.1057920850104567,
|
|
"learning_rate": 1.6836607366849796e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 61817106.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1236182376742363,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 728
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.0946236559139786,
|
|
"grad_norm": 0.06764032154212377,
|
|
"learning_rate": 1.6828391241854981e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 61894528.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 729
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6722160839964078e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 2.097491039426523,
|
|
"grad_norm": 0.14158381613631227,
|
|
"learning_rate": 1.6820166471416841e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 61984766.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 730
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.9792225930357895e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.100358422939068,
|
|
"grad_norm": 0.13341958124051298,
|
|
"learning_rate": 1.6811933065948773e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 62066304.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 731
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.1032258064516127,
|
|
"grad_norm": 0.10447308299187771,
|
|
"learning_rate": 1.6803691035875117e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 62144821.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 732
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.1060931899641577,
|
|
"grad_norm": 0.1208101716876214,
|
|
"learning_rate": 1.6795440391631122e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 62220125.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 733
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 8.131125327555407e-09,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.1089605734767023,
|
|
"grad_norm": 0.11976838424289314,
|
|
"learning_rate": 1.6787181143662953e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 62305754.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.16675157845020294,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 734
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 3.520874077163261e-09,
|
|
"advantages/std": 0.661286473274231,
|
|
"advantages/var": 0.4372997997354702,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.6875,
|
|
"epoch": 2.1118279569892473,
|
|
"grad_norm": 0.116480809225144,
|
|
"learning_rate": 1.6778913302427666e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 62397252.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.22567126154899597,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 735
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262335283006049e-09,
|
|
"advantages/std": 0.5726868510246277,
|
|
"advantages/var": 0.3279702293365041,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.71875,
|
|
"epoch": 2.1146953405017923,
|
|
"grad_norm": 0.1915343604730967,
|
|
"learning_rate": 1.6770636878393191e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 62491536.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.16545340418815613,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 736
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 2.117562724014337,
|
|
"grad_norm": 0.11750821923740856,
|
|
"learning_rate": 1.676235188203834e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 62570395.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 737
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.120430107526882,
|
|
"grad_norm": 0.0702618672546427,
|
|
"learning_rate": 1.6754058323852753e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 62650546.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 738
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.1232974910394264,
|
|
"grad_norm": 0.08698472471690182,
|
|
"learning_rate": 1.6745756214336934e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 62729753.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 739
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.394200364231044e-08,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.1261648745519715,
|
|
"grad_norm": 0.12130414909496527,
|
|
"learning_rate": 1.6737445564002203e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 62802801.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 740
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.129032258064516,
|
|
"grad_norm": 0.09371527079704135,
|
|
"learning_rate": 1.6729126383370696e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 62887489.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 741
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.504997077293582e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.131899641577061,
|
|
"grad_norm": 0.09466306667377959,
|
|
"learning_rate": 1.6720798682975348e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 62972014.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 742
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.1347670250896056,
|
|
"grad_norm": 0.028094443208474578,
|
|
"learning_rate": 1.6712462473359876e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63040350.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 743
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.1376344086021506,
|
|
"grad_norm": 0.0827406576698606,
|
|
"learning_rate": 1.6704117765078787e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63118845.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 744
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.065623173308489e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.140501792114695,
|
|
"grad_norm": 0.11513769535302304,
|
|
"learning_rate": 1.6695764568697328e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63207076.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.15650182962417603,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 745
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.95862671130252e-10,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.14336917562724,
|
|
"grad_norm": 0.14039636884747167,
|
|
"learning_rate": 1.6687402894791506e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63297581.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 746
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917335935919526e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.146236559139785,
|
|
"grad_norm": 0.10127409709172366,
|
|
"learning_rate": 1.6679032753948055e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 63383169.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 747
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.252531375408196e-09,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.1491039426523297,
|
|
"grad_norm": 0.24271622070952423,
|
|
"learning_rate": 1.6670654156764436e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63466897.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.15072786808013916,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 748
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516804898616483e-09,
|
|
"advantages/std": 0.6185716986656189,
|
|
"advantages/var": 0.3826309463900692,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.1519713261648747,
|
|
"grad_norm": 0.09495054844683182,
|
|
"learning_rate": 1.6662267113848806e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63568628.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.18990950286388397,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 749
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.516801416758646e-09,
|
|
"advantages/std": 0.6185721755027771,
|
|
"advantages/var": 0.3826315363062385,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.1548387096774193,
|
|
"grad_norm": 0.11483586687460479,
|
|
"learning_rate": 1.6653871635820026e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 63658107.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.19097033143043518,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 750
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975166506069069e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.1577060931899643,
|
|
"grad_norm": 0.0819964801308947,
|
|
"learning_rate": 1.6645467733307628e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63741468.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 751
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907227504745508e-10,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.160573476702509,
|
|
"grad_norm": 0.12539445836552948,
|
|
"learning_rate": 1.6637055416951817e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 63818006.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 752
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975166506069069e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.163440860215054,
|
|
"grad_norm": 0.10930395045054528,
|
|
"learning_rate": 1.6628634697403444e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63896332.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 753
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.1125794765316845e-09,
|
|
"advantages/std": 0.6612692475318909,
|
|
"advantages/var": 0.43727701773139316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.1663082437275984,
|
|
"grad_norm": 0.14638189914691208,
|
|
"learning_rate": 1.6620205585324013e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 63990639.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.20069602131843567,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 754
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 9.858529166555944e-09,
|
|
"advantages/std": 0.6612809896469116,
|
|
"advantages/var": 0.43729254726839883,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.1691756272401435,
|
|
"grad_norm": 0.15347012325541226,
|
|
"learning_rate": 1.6611768091385629e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64072599.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.2188364714384079,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 755
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954734451444e-08,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.172043010752688,
|
|
"grad_norm": 0.10962415186479377,
|
|
"learning_rate": 1.6603322226271038e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64153799.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 756
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516894993554525e-09,
|
|
"advantages/std": 0.6185593605041504,
|
|
"advantages/var": 0.3826156824673035,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.174910394265233,
|
|
"grad_norm": 0.15211530736200046,
|
|
"learning_rate": 1.659486800067356e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64236188.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.17517907917499542,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 757
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.1777777777777776,
|
|
"grad_norm": 0.12091419499987931,
|
|
"learning_rate": 1.658640542529712e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64314952.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 758
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.09543064605741e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.1806451612903226,
|
|
"grad_norm": 0.06596850187666084,
|
|
"learning_rate": 1.65779345108562e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64405738.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 759
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749445740229558e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.183512544802867,
|
|
"grad_norm": 0.1014149366216826,
|
|
"learning_rate": 1.6569455268075853e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64479877.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 760
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.186379928315412,
|
|
"grad_norm": 0.05411609763188232,
|
|
"learning_rate": 1.656096770769166e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64567374.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 761
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.189247311827957,
|
|
"grad_norm": 0.057790587081550625,
|
|
"learning_rate": 1.6552471840449752e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 64647877.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 762
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 9.797994938980706e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.1921146953405017,
|
|
"grad_norm": 0.09229350004209458,
|
|
"learning_rate": 1.6543967677106765e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64725739.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 763
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.1949820788530467,
|
|
"grad_norm": 0.08741759453207316,
|
|
"learning_rate": 1.6535455228429838e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 64804835.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 764
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.4393350594456193e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.1978494623655913,
|
|
"grad_norm": 0.10247069099104299,
|
|
"learning_rate": 1.6526934505196605e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 64893347.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 765
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.0112830604957173e-09,
|
|
"advantages/std": 0.6185553073883057,
|
|
"advantages/var": 0.3826106682982413,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.2007168458781363,
|
|
"grad_norm": 0.14126744166687255,
|
|
"learning_rate": 1.6518405518195175e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 64982653.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.16834919154644012,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 766
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.970845876775471e-09,
|
|
"advantages/std": 0.46760880947113037,
|
|
"advantages/var": 0.2186579986950079,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.203584229390681,
|
|
"grad_norm": 0.08696000030980688,
|
|
"learning_rate": 1.6509868278224124e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 65067351.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11914245784282684,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 767
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.206451612903226,
|
|
"grad_norm": 0.059580949505776214,
|
|
"learning_rate": 1.6501322796092468e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 65137747.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 768
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.975157366213667e-09,
|
|
"advantages/std": 0.4675973355770111,
|
|
"advantages/var": 0.21864726823871994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.2093189964157705,
|
|
"grad_norm": 0.12920441443676584,
|
|
"learning_rate": 1.6492769082619669e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 65222576.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.11100947856903076,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 769
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.053907636466336e-08,
|
|
"advantages/std": 0.618579626083374,
|
|
"advantages/var": 0.3826407538054468,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.2121863799283155,
|
|
"grad_norm": 0.17369470199564782,
|
|
"learning_rate": 1.64842071486356e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 65309214.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.2001592218875885,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 770
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344456541825744e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 2.21505376344086,
|
|
"grad_norm": 0.12310396305131238,
|
|
"learning_rate": 1.6475637004980545e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 65383541.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 771
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.6262555319655851e-09,
|
|
"advantages/std": 0.5726791024208069,
|
|
"advantages/var": 0.327961354349501,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.217921146953405,
|
|
"grad_norm": 0.13788401845903062,
|
|
"learning_rate": 1.6467058662505193e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 65457069.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.15308690071105957,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 772
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049402934764073e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.22078853046595,
|
|
"grad_norm": 0.08234956359655557,
|
|
"learning_rate": 1.6458472132070598e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 65542338.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 773
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5056509608456243e-09,
|
|
"advantages/std": 0.6185514330863953,
|
|
"advantages/var": 0.3826058753732333,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.2236559139784946,
|
|
"grad_norm": 0.18533237301540315,
|
|
"learning_rate": 1.6449877424548191e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 65627174.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1649293303489685,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 774
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.2265232974910396,
|
|
"grad_norm": 0.14678834834025797,
|
|
"learning_rate": 1.6441274550819752e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 65718040.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 775
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876548503938182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.229390681003584,
|
|
"grad_norm": 0.13079457903392905,
|
|
"learning_rate": 1.6432663521777398e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 65790063.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 776
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.232258064516129,
|
|
"grad_norm": 0.07412815956067396,
|
|
"learning_rate": 1.642404434832358e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 65863602.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 777
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125942055767658e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.2351254480286737,
|
|
"grad_norm": 0.07015696451372215,
|
|
"learning_rate": 1.6415417041371052e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 65949550.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 778
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125909557323754e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.2379928315412188,
|
|
"grad_norm": 0.08181718966797533,
|
|
"learning_rate": 1.640678161184287e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 66024603.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 779
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536081669351505e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.2408602150537633,
|
|
"grad_norm": 0.12480215557562097,
|
|
"learning_rate": 1.6398138070672372e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 66091063.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 780
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-09,
|
|
"advantages/snr": 1.9916359783736918e-08,
|
|
"advantages/std": 0.46761685609817505,
|
|
"advantages/var": 0.21866552410714135,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.2437275985663083,
|
|
"grad_norm": 0.0789649925885418,
|
|
"learning_rate": 1.638948642880317e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 66163966.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1293872892856598,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 781
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.246594982078853,
|
|
"grad_norm": 0.11120191454022281,
|
|
"learning_rate": 1.6380826697189126e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 66236604.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 782
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958460418703728e-10,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.249462365591398,
|
|
"grad_norm": 0.10790084580548243,
|
|
"learning_rate": 1.6372158886794348e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 66311793.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 783
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.691850523553041e-09,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.252329749103943,
|
|
"grad_norm": 0.11999164653187376,
|
|
"learning_rate": 1.6363483008593175e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 66394471.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 784
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.2551971326164875,
|
|
"grad_norm": 0.054776655410849015,
|
|
"learning_rate": 1.6354799073570158e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 66471747.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 785
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.258064516129032,
|
|
"grad_norm": 0.06885862272421579,
|
|
"learning_rate": 1.6346107092720047e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 66543060.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 786
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917811987622486e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.260931899641577,
|
|
"grad_norm": 0.09426756429735036,
|
|
"learning_rate": 1.6337407077047783e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 66619983.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 787
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450174584180815e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.263799283154122,
|
|
"grad_norm": 0.07220149653500607,
|
|
"learning_rate": 1.6328699037568477e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 66703084.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 788
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049341918838717e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.2666666666666666,
|
|
"grad_norm": 0.07166662293294265,
|
|
"learning_rate": 1.6319982985307398e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 66781970.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 789
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599467908273411e-09,
|
|
"advantages/std": 0.4049696922302246,
|
|
"advantages/var": 0.16400045162504284,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 2.2695340501792116,
|
|
"grad_norm": 0.0901087122796034,
|
|
"learning_rate": 1.6311258931299962e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 66871490.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.09916213154792786,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 790
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.294568831333326e-08,
|
|
"advantages/std": 0.46761488914489746,
|
|
"advantages/var": 0.21866368454999474,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.272401433691756,
|
|
"grad_norm": 0.168916034453994,
|
|
"learning_rate": 1.6302526886591718e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 66953245.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.12703317403793335,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 791
|
|
},
|
|
{
|
|
"advantages/mean": -9.778887033462524e-09,
|
|
"advantages/snr": 1.580873184003523e-08,
|
|
"advantages/std": 0.6185750365257263,
|
|
"advantages/var": 0.38263507581280365,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.275268817204301,
|
|
"grad_norm": 0.1588000831481918,
|
|
"learning_rate": 1.6293786862238331e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 67038538.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.192268505692482,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 792
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.2781362007168457,
|
|
"grad_norm": 0.11748585850412334,
|
|
"learning_rate": 1.6285038869305564e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 67114899.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 793
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.2810035842293908,
|
|
"grad_norm": 0.04012524798369861,
|
|
"learning_rate": 1.6276282918869273e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 67196480.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 794
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.0112830604957173e-09,
|
|
"advantages/std": 0.6185553073883057,
|
|
"advantages/var": 0.3826106682982413,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 2.2838709677419353,
|
|
"grad_norm": 0.1256492011230556,
|
|
"learning_rate": 1.6267519022015393e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 67282663.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.16834919154644012,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 795
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.2596798670633243e-09,
|
|
"advantages/std": 0.7393327355384827,
|
|
"advantages/var": 0.546612893838816,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.2867383512544803,
|
|
"grad_norm": 0.18596069133345994,
|
|
"learning_rate": 1.625874718983991e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 67373465.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.2698654532432556,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 796
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814432667740602e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.289605734767025,
|
|
"grad_norm": 0.13332228534824503,
|
|
"learning_rate": 1.6249967433448867e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 67453638.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 797
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983400669593257e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.29247311827957,
|
|
"grad_norm": 0.10214177942000607,
|
|
"learning_rate": 1.6241179763958331e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 67522550.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.11336850374937057,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 798
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.252515803253119e-09,
|
|
"advantages/std": 0.57267826795578,
|
|
"advantages/var": 0.3279603985888322,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.295340501792115,
|
|
"grad_norm": 0.10592730340612073,
|
|
"learning_rate": 1.62323841924944e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 67618155.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1552036553621292,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 799
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.987466533846137e-09,
|
|
"advantages/std": 0.46761488914489746,
|
|
"advantages/var": 0.21866368454999474,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.2982078853046595,
|
|
"grad_norm": 0.08604654935892782,
|
|
"learning_rate": 1.6223580730193166e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 67703705.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.12703317403793335,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 800
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016361261689808e-09,
|
|
"advantages/std": 0.5227997303009033,
|
|
"advantages/var": 0.27331955800269725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.3010752688172045,
|
|
"grad_norm": 0.11778978963362384,
|
|
"learning_rate": 1.6214769388200709e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 67777762.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.14806944131851196,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 801
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.8787427301770344e-09,
|
|
"advantages/std": 0.5726819038391113,
|
|
"advantages/var": 0.32796456298478915,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.303942652329749,
|
|
"grad_norm": 0.1344213731837575,
|
|
"learning_rate": 1.62059501776731e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 67857518.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.15756267309188843,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 802
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.5933270610653542e-08,
|
|
"advantages/std": 0.46761149168014526,
|
|
"advantages/var": 0.21866050715133056,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.306810035842294,
|
|
"grad_norm": 0.16060446403631687,
|
|
"learning_rate": 1.6197123109776358e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 67939276.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.12255740165710449,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 803
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5627630490351976e-09,
|
|
"advantages/std": 0.5228091478347778,
|
|
"advantages/var": 0.2733294050597266,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.3096774193548386,
|
|
"grad_norm": 0.08810243865681089,
|
|
"learning_rate": 1.6188288195686457e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 68025822.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.15831917524337769,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 804
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5628597236829876e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.3125448028673836,
|
|
"grad_norm": 0.14997348940396277,
|
|
"learning_rate": 1.6179445446589307e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 68106382.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 805
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.315412186379928,
|
|
"grad_norm": 0.020152598229408337,
|
|
"learning_rate": 1.617059487368073e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 68190285.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 806
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.505094572484785e-09,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.318279569892473,
|
|
"grad_norm": 0.14840773328111326,
|
|
"learning_rate": 1.616173648816646e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 68268330.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.1462521106004715,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 807
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814513910737996e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.3211469534050178,
|
|
"grad_norm": 0.08673907543339605,
|
|
"learning_rate": 1.6152870301262125e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 68344121.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 808
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9874667242452167e-09,
|
|
"advantages/std": 0.4676148593425751,
|
|
"advantages/var": 0.21866365667797627,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.3240143369175628,
|
|
"grad_norm": 0.10158509615616124,
|
|
"learning_rate": 1.6143996324193223e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 68437027.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.12703317403793335,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 809
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.3268817204301078,
|
|
"grad_norm": 0.09161765227577882,
|
|
"learning_rate": 1.613511456819512e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 68524081.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 810
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.3297491039426523,
|
|
"grad_norm": 0.10019493776074073,
|
|
"learning_rate": 1.6126225044513029e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 68601585.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 811
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.781509278854418e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.332616487455197,
|
|
"grad_norm": 0.10708769991937248,
|
|
"learning_rate": 1.6117327764401995e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 68683203.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 812
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33063647150993347,
|
|
"advantages/var": 0.10932047629253905,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.335483870967742,
|
|
"grad_norm": 0.05251976644698296,
|
|
"learning_rate": 1.6108422739126893e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 68752714.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 813
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917335935919526e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 2.338351254480287,
|
|
"grad_norm": 0.06283725007481095,
|
|
"learning_rate": 1.6099509979962393e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 68846524.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.1065337136387825,
|
|
"rewards/drgrpo_math_reward/mean": 0.5,
|
|
"rewards/drgrpo_math_reward/std": 0.5019646286964417,
|
|
"step": 814
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.3412186379928315,
|
|
"grad_norm": 0.05613515715843307,
|
|
"learning_rate": 1.6090589498192969e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 68929137.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 815
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.0570579543741361e-08,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.3440860215053765,
|
|
"grad_norm": 0.1734403360847045,
|
|
"learning_rate": 1.6081661305112855e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69003472.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 816
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.346953405017921,
|
|
"grad_norm": 0.08192004340641214,
|
|
"learning_rate": 1.6072725412026065e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69075836.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 817
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983384167481491e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.349820788530466,
|
|
"grad_norm": 0.10030683878884093,
|
|
"learning_rate": 1.6063781830246355e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 69165604.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 818
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.3526881720430106,
|
|
"grad_norm": 0.14778584127973038,
|
|
"learning_rate": 1.6054830571097214e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69239182.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 819
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.02702641973921866,
|
|
"learning_rate": 1.6045871645911859e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69313409.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 820
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958222414150722e-10,
|
|
"advantages/std": 0.4676148593425751,
|
|
"advantages/var": 0.21866365667797627,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.3584229390681,
|
|
"grad_norm": 0.10873077079893656,
|
|
"learning_rate": 1.6036905066033205e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 69392487.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.12703317403793335,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 821
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125813688641252e-09,
|
|
"advantages/std": 0.5227880477905273,
|
|
"advantages/var": 0.2733073429126307,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.361290322580645,
|
|
"grad_norm": 0.09043236914308421,
|
|
"learning_rate": 1.6027930842813857e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69471670.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 822
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.065478038084407e-09,
|
|
"advantages/std": 0.5727017521858215,
|
|
"advantages/var": 0.32798729695671014,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.3641577060931898,
|
|
"grad_norm": 0.21648865050194732,
|
|
"learning_rate": 1.6018948987616105e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69555442.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.18253791332244873,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 823
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 7.318086675313809e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.3670250896057348,
|
|
"grad_norm": 0.15665828090321152,
|
|
"learning_rate": 1.6009959511811903e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69635152.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 824
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.62618106042536e-09,
|
|
"advantages/std": 0.5727053284645081,
|
|
"advantages/var": 0.32799139325164006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.3698924731182798,
|
|
"grad_norm": 0.17779721014436148,
|
|
"learning_rate": 1.6000962426782841e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 69723627.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.18489694595336914,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 825
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.394213515514367e-08,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.3727598566308243,
|
|
"grad_norm": 0.10454738836965104,
|
|
"learning_rate": 1.5991957743920157e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 69798447.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 826
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 7.528111170810656e-10,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.3756272401433693,
|
|
"grad_norm": 0.14756827033506406,
|
|
"learning_rate": 1.59829454746247e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69874364.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.17859894037246704,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 827
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.4083979969389336e-09,
|
|
"advantages/std": 0.6612637639045715,
|
|
"advantages/var": 0.4372697654532409,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.378494623655914,
|
|
"grad_norm": 0.17218708550366563,
|
|
"learning_rate": 1.5973925630306928e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 69953901.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1938612163066864,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 828
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.381362007168459,
|
|
"grad_norm": 0.0739302193567203,
|
|
"learning_rate": 1.5964898222386886e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 70030525.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 829
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599795920412534e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.3842293906810035,
|
|
"grad_norm": 0.1012210466032213,
|
|
"learning_rate": 1.5955863262294203e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 70111438.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 830
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.3870967741935485,
|
|
"grad_norm": 0.052153777357613056,
|
|
"learning_rate": 1.5946820761468058e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 70199376.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 831
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.389964157706093,
|
|
"grad_norm": 0.04994625248728887,
|
|
"learning_rate": 1.5937770731357189e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 70284417.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 832
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 8.944284405653326e-09,
|
|
"advantages/std": 0.5726868510246277,
|
|
"advantages/var": 0.3279702293365041,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.392831541218638,
|
|
"grad_norm": 0.08601123088625985,
|
|
"learning_rate": 1.5928713183419857e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 70368337.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.16545338928699493,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 833
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 6.298515617681381e-09,
|
|
"advantages/std": 0.7393190860748291,
|
|
"advantages/var": 0.5465927110345206,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.3956989247311826,
|
|
"grad_norm": 0.16615908431907986,
|
|
"learning_rate": 1.5919648129123854e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 70447518.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.24830512702465057,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 834
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.3985663082437276,
|
|
"grad_norm": 0.07375274258284313,
|
|
"learning_rate": 1.5910575579946462e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 70519602.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 835
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9753097007876364e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.4014336917562726,
|
|
"grad_norm": 0.10647210029597416,
|
|
"learning_rate": 1.590149554737446e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 70590978.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 836
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962665216109293e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.404301075268817,
|
|
"grad_norm": 0.08731420444068373,
|
|
"learning_rate": 1.5892408042904097e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 70678189.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 837
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975166506069069e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.4071684587813618,
|
|
"grad_norm": 0.07912848658835807,
|
|
"learning_rate": 1.5883313078041092e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 70761137.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 838
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.4100358422939068,
|
|
"grad_norm": 0.07558171588476088,
|
|
"learning_rate": 1.5874210664300598e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 70840232.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 839
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633261853378446e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.412903225806452,
|
|
"grad_norm": 0.0658563126895547,
|
|
"learning_rate": 1.5865100813207204e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 70918041.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 840
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814306742559065e-09,
|
|
"advantages/std": 0.5227947235107422,
|
|
"advantages/var": 0.27331432293067337,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.4157706093189963,
|
|
"grad_norm": 0.09599558968393125,
|
|
"learning_rate": 1.585598353629492e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 71005770.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.14441713690757751,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 841
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.991736767199176e-09,
|
|
"advantages/std": 0.4675931930541992,
|
|
"advantages/var": 0.21864339419062162,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.4186379928315414,
|
|
"grad_norm": 0.09228833027736225,
|
|
"learning_rate": 1.5846858845107146e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 71095368.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 842
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199247907244247e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.421505376344086,
|
|
"grad_norm": 0.0888693255334843,
|
|
"learning_rate": 1.5837726751196678e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 71179062.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 843
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.4083785526452889e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.424372759856631,
|
|
"grad_norm": 0.08783138167701011,
|
|
"learning_rate": 1.582858726612569e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 71255646.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 844
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.4272401433691755,
|
|
"grad_norm": 0.04169677356932018,
|
|
"learning_rate": 1.58194404014657e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 71336082.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 845
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.13154848685047e-09,
|
|
"advantages/std": 0.5726600289344788,
|
|
"advantages/var": 0.32793950873923805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.4301075268817205,
|
|
"grad_norm": 0.17353269166094223,
|
|
"learning_rate": 1.581028616879758e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 71406830.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13258251547813416,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 846
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.432974910394265,
|
|
"grad_norm": 0.11219877687026668,
|
|
"learning_rate": 1.5801124579711524e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 71478282.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 847
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 6.337738429595054e-09,
|
|
"advantages/std": 0.6612692475318909,
|
|
"advantages/var": 0.43727701773139316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.43584229390681,
|
|
"grad_norm": 0.17166820647525638,
|
|
"learning_rate": 1.5791955645807047e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 71567252.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.20069602131843567,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 848
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.4387096774193546,
|
|
"grad_norm": 0.07494762996346009,
|
|
"learning_rate": 1.5782779378692954e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 71657155.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 849
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.7495735248314805e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.4415770609318996,
|
|
"grad_norm": 0.0867713706613162,
|
|
"learning_rate": 1.5773595789987347e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 71744493.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 850
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5628702850968443e-09,
|
|
"advantages/std": 0.5227934122085571,
|
|
"advantages/var": 0.27331295184866633,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 2.4444444444444446,
|
|
"grad_norm": 0.08048893955625597,
|
|
"learning_rate": 1.5764404891317582e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 71832101.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1422954797744751,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 851
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 9.79780726992476e-09,
|
|
"advantages/std": 0.5227980017662048,
|
|
"advantages/var": 0.2733177506507367,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.447311827956989,
|
|
"grad_norm": 0.1405487397602274,
|
|
"learning_rate": 1.5755206694320284e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 71913348.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.14571532607078552,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 852
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 1.09543064605741e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.450179211469534,
|
|
"grad_norm": 0.10916833852927496,
|
|
"learning_rate": 1.5746001210641315e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 71995750.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 853
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.4530465949820788,
|
|
"grad_norm": 0.09746033420380783,
|
|
"learning_rate": 1.5736788451935761e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72067809.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 854
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.455913978494624,
|
|
"grad_norm": 0.07552149181874301,
|
|
"learning_rate": 1.572756842986791e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72148314.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 855
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344336502847305e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.4587813620071683,
|
|
"grad_norm": 0.12037563104289,
|
|
"learning_rate": 1.5718341156111266e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72218695.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 856
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.4616487455197134,
|
|
"grad_norm": 0.037175842241328985,
|
|
"learning_rate": 1.57091066423485e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 72304989.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 857
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975300560494157e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.464516129032258,
|
|
"grad_norm": 0.096446285201923,
|
|
"learning_rate": 1.5699864900271452e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72391234.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 858
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979230209351863e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.467383512544803,
|
|
"grad_norm": 0.10407527655496832,
|
|
"learning_rate": 1.5690615941581116e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72467982.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 859
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.4702508960573475,
|
|
"grad_norm": 0.08420394392631414,
|
|
"learning_rate": 1.5681359777987616e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72548772.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 860
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.4083459495941717e-09,
|
|
"advantages/std": 0.6612882018089294,
|
|
"advantages/var": 0.4373020858516874,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.4731182795698925,
|
|
"grad_norm": 0.13591374215815513,
|
|
"learning_rate": 1.5672096421210218e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 72638390.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.22567616403102875,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 861
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.4759856630824375,
|
|
"grad_norm": 0.10533486860728158,
|
|
"learning_rate": 1.5662825882977267e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72727295.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 862
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.478853046594982,
|
|
"grad_norm": 0.07881267236540548,
|
|
"learning_rate": 1.5653548175026223e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72808598.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 863
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.481720430107527,
|
|
"grad_norm": 0.07840275481089848,
|
|
"learning_rate": 1.5644263309103612e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 72880030.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 864
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.1499403476539522e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.4845878136200716,
|
|
"grad_norm": 0.108450197441348,
|
|
"learning_rate": 1.5634971296965027e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 72959411.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 865
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749445740229558e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.4874551971326166,
|
|
"grad_norm": 0.07035031215141113,
|
|
"learning_rate": 1.562567215037511e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 73041173.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 866
|
|
},
|
|
{
|
|
"advantages/mean": -1.6298145055770874e-09,
|
|
"advantages/snr": 3.1175438442211334e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.490322580645161,
|
|
"grad_norm": 0.12188140921287076,
|
|
"learning_rate": 1.5616365881107527e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 73132608.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 867
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.493189964157706,
|
|
"grad_norm": 0.08703212815397973,
|
|
"learning_rate": 1.5607052500944975e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 73211769.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 868
|
|
},
|
|
{
|
|
"advantages/mean": -7.916241884231567e-09,
|
|
"advantages/snr": 1.3822759121218127e-08,
|
|
"advantages/std": 0.5726962089538574,
|
|
"advantages/var": 0.3279809477501203,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.496057347670251,
|
|
"grad_norm": 0.093374326160837,
|
|
"learning_rate": 1.559773202167915e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 73298035.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.17358636856079102,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 869
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.498924731182796,
|
|
"grad_norm": 0.07205080915872775,
|
|
"learning_rate": 1.5588404455110729e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 73380492.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 870
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878766595896755e-09,
|
|
"advantages/std": 0.5726791024208069,
|
|
"advantages/var": 0.327961354349501,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.5017921146953404,
|
|
"grad_norm": 0.18283812648687808,
|
|
"learning_rate": 1.557906981304937e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 73459739.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.15308690071105957,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 871
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.5047553830684996e-09,
|
|
"advantages/std": 0.5727025866508484,
|
|
"advantages/var": 0.3279882527565725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.5046594982078854,
|
|
"grad_norm": 0.11614709433264811,
|
|
"learning_rate": 1.5569728107313682e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 73544814.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.1804211586713791,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 872
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.5075268817204304,
|
|
"grad_norm": 0.06259624112532391,
|
|
"learning_rate": 1.556037934973123e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 73623858.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 873
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721323019547286e-09,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.510394265232975,
|
|
"grad_norm": 0.0982949417266809,
|
|
"learning_rate": 1.5551023552138499e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 73705544.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 874
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125782003796406e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.5132616487455195,
|
|
"grad_norm": 0.11004084520606709,
|
|
"learning_rate": 1.5541660726380884e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 73793032.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 875
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.5161290322580645,
|
|
"grad_norm": 0.0929457191484818,
|
|
"learning_rate": 1.5532290884312677e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 73875683.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 876
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.2523743065903652e-09,
|
|
"advantages/std": 0.5727031826972961,
|
|
"advantages/var": 0.32798893547161256,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.5189964157706095,
|
|
"grad_norm": 0.13685780520987997,
|
|
"learning_rate": 1.552291403779707e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 73966745.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1814819872379303,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 877
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.521863799283154,
|
|
"grad_norm": 0.07743984670080835,
|
|
"learning_rate": 1.5513530198706103e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 74043948.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 878
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.3360778801984135e-08,
|
|
"advantages/std": 0.5227928161621094,
|
|
"advantages/var": 0.2733123286307091,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.524731182795699,
|
|
"grad_norm": 0.1524430947405621,
|
|
"learning_rate": 1.5504139378920687e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 74119575.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 879
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185494065284729,
|
|
"advantages/var": 0.38260336831672603,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.5275985663082436,
|
|
"grad_norm": 0.19977156677764607,
|
|
"learning_rate": 1.5494741590330552e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 74196680.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.16151440143585205,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 880
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.563018557708836e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.5304659498207887,
|
|
"grad_norm": 0.10824165725563178,
|
|
"learning_rate": 1.5485336844834272e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 74280381.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 881
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516905874828942e-09,
|
|
"advantages/std": 0.618557870388031,
|
|
"advantages/var": 0.38261383901897617,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.533333333333333,
|
|
"grad_norm": 0.13516385807945042,
|
|
"learning_rate": 1.5475925154339209e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 74362501.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.17282497882843018,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 882
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.5362007168458782,
|
|
"grad_norm": 0.0878478830412105,
|
|
"learning_rate": 1.5466506530761535e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 74451427.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 883
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.539068100358423,
|
|
"grad_norm": 0.09807090921005532,
|
|
"learning_rate": 1.5457080986026193e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 74541539.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 884
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 2.541935483870968,
|
|
"grad_norm": 0.09864711259132558,
|
|
"learning_rate": 1.5447648532066886e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 74630489.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 885
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.691787729288785e-09,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.5448028673835124,
|
|
"grad_norm": 0.1495485160941214,
|
|
"learning_rate": 1.543820918082607e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 74708218.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.16675157845020294,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 886
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.5476702508960574,
|
|
"grad_norm": 0.0590678300248879,
|
|
"learning_rate": 1.5428762944254929e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 74784545.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 887
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.2348929130638486e-09,
|
|
"advantages/std": 0.522804319858551,
|
|
"advantages/var": 0.27332435686276213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.5505376344086024,
|
|
"grad_norm": 0.14348185161722257,
|
|
"learning_rate": 1.5419309834313366e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 74864179.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1514892876148224,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 888
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444254652277355e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.553405017921147,
|
|
"grad_norm": 0.09711082824675567,
|
|
"learning_rate": 1.5409849862969993e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 74956343.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 889
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562954778661877e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.5562724014336915,
|
|
"grad_norm": 0.0991722264535798,
|
|
"learning_rate": 1.5400383042202094e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 75043580.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 890
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.3443230978138685e-09,
|
|
"advantages/std": 0.5227916836738586,
|
|
"advantages/var": 0.2733111445185479,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.5591397849462365,
|
|
"grad_norm": 0.1178474376865152,
|
|
"learning_rate": 1.5390909383995645e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 75136495.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.13994136452674866,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 891
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.5620071684587815,
|
|
"grad_norm": 0.038278747152186696,
|
|
"learning_rate": 1.5381428900345264e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 75209995.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 892
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.564874551971326,
|
|
"grad_norm": 0.08258340506919103,
|
|
"learning_rate": 1.5371941603254212e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 75275390.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 893
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975067111642947e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.567741935483871,
|
|
"grad_norm": 0.06255011662610364,
|
|
"learning_rate": 1.5362447504734386e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 75356967.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 894
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.022324708878079e-09,
|
|
"advantages/std": 0.6185801029205322,
|
|
"advantages/var": 0.38264134372917624,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.5706093189964156,
|
|
"grad_norm": 0.11484738745539673,
|
|
"learning_rate": 1.5352946616806284e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 75454518.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.2012200653553009,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 895
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983378074428632e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.5734767025089607,
|
|
"grad_norm": 0.11358406298400703,
|
|
"learning_rate": 1.5343438951499004e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 75527502.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 896
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.576344086021505,
|
|
"grad_norm": 0.04788623486583116,
|
|
"learning_rate": 1.5333924520850226e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 75616569.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.08443662524223328,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 897
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 8.016503840289217e-09,
|
|
"advantages/std": 0.5227904319763184,
|
|
"advantages/var": 0.27330983576598555,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.5792114695340502,
|
|
"grad_norm": 0.10138377339107882,
|
|
"learning_rate": 1.5324403336906192e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 75688872.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 898
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380984365940094,
|
|
"advantages/var": 0.05466704299203351,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.5820788530465952,
|
|
"grad_norm": 0.040232547209283584,
|
|
"learning_rate": 1.5314875411721703e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 75760606.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 899
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.58494623655914,
|
|
"grad_norm": 0.06228123707556885,
|
|
"learning_rate": 1.5305340757360084e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 75835645.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 900
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599761052090956e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.5878136200716844,
|
|
"grad_norm": 0.06372713400930594,
|
|
"learning_rate": 1.5295799385893187e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 75908865.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 901
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 5.6335249592517076e-09,
|
|
"advantages/std": 0.6612716317176819,
|
|
"advantages/var": 0.4372801709145655,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.5906810035842294,
|
|
"grad_norm": 0.1520930595061047,
|
|
"learning_rate": 1.5286251309401367e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 75989610.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.20517179369926453,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 902
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899641578136434e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.5935483870967744,
|
|
"grad_norm": 0.12045278274152169,
|
|
"learning_rate": 1.5276696539973463e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 76073826.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 903
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 2.596415770609319,
|
|
"grad_norm": 0.06414423443784778,
|
|
"learning_rate": 1.5267135089706799e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 76161573.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 904
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998806953079044e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.599283154121864,
|
|
"grad_norm": 0.0943382436274999,
|
|
"learning_rate": 1.5257566970707146e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 76243364.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 905
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199522104181912e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.6021505376344085,
|
|
"grad_norm": 0.09240194951046704,
|
|
"learning_rate": 1.5247992195088726e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 76322823.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 906
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.6050179211469535,
|
|
"grad_norm": 0.09991208834120584,
|
|
"learning_rate": 1.5238410774974186e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 76394083.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 907
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995024336639936e-09,
|
|
"advantages/std": 0.40496665239334106,
|
|
"advantages/var": 0.16399798955066913,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.607885304659498,
|
|
"grad_norm": 0.09315548607081912,
|
|
"learning_rate": 1.522882272249459e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 76469286.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.09574718773365021,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 908
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.610752688172043,
|
|
"grad_norm": 0.06860176298994038,
|
|
"learning_rate": 1.5219228049789385e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 76542018.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 909
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967079601050182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.6136200716845877,
|
|
"grad_norm": 0.07965350424581469,
|
|
"learning_rate": 1.5209626769006424e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 76628055.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 910
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344599133202926e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.6164874551971327,
|
|
"grad_norm": 0.1287594496900276,
|
|
"learning_rate": 1.52000188923019e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 76698586.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 911
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2946214724693276e-08,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.6193548387096772,
|
|
"grad_norm": 0.09505569044083684,
|
|
"learning_rate": 1.5190404431840379e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 76778726.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 912
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.6222222222222222,
|
|
"grad_norm": 0.0714478703207126,
|
|
"learning_rate": 1.5180783399794748e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 76863299.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 913
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.6262492693233955e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.6250896057347672,
|
|
"grad_norm": 0.11699639846045115,
|
|
"learning_rate": 1.5171155808346225e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 76953880.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 914
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.627956989247312,
|
|
"grad_norm": 0.06128489390695928,
|
|
"learning_rate": 1.5161521669684324e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77027011.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 915
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814751550759118e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.6308243727598564,
|
|
"grad_norm": 0.10874420590707522,
|
|
"learning_rate": 1.5151880996006849e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77118787.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 916
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.6336917562724014,
|
|
"grad_norm": 0.06940406161791886,
|
|
"learning_rate": 1.5142233799519888e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77198046.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 917
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.6365591397849464,
|
|
"grad_norm": 0.0862377687131543,
|
|
"learning_rate": 1.5132580092437776e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77277263.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 918
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.13111686481873e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.639426523297491,
|
|
"grad_norm": 0.15485793207595325,
|
|
"learning_rate": 1.5122919886983101e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77360134.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 919
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.252321502378045e-09,
|
|
"advantages/std": 0.5727124810218811,
|
|
"advantages/var": 0.3279995859182385,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.642293906810036,
|
|
"grad_norm": 0.12812788734814362,
|
|
"learning_rate": 1.5113253195386669e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 77449426.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1962026059627533,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 920
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504893491854984e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 2.6451612903225805,
|
|
"grad_norm": 0.16883266529636876,
|
|
"learning_rate": 1.5103580029887501e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77528729.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 921
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383744885263768e-08,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.6480286738351255,
|
|
"grad_norm": 0.13546924997345466,
|
|
"learning_rate": 1.5093900402732822e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77614891.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 922
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907665222004876e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.65089605734767,
|
|
"grad_norm": 0.15356635061355306,
|
|
"learning_rate": 1.508421432617803e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77684846.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 923
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814513910737996e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.653763440860215,
|
|
"grad_norm": 0.16045655005144885,
|
|
"learning_rate": 1.5074521812486686e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77763563.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 924
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 7.528182986834278e-10,
|
|
"advantages/std": 0.618557333946228,
|
|
"advantages/var": 0.38261317537866546,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.65663082437276,
|
|
"grad_norm": 0.12945671903202308,
|
|
"learning_rate": 1.5064822873930514e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 77843577.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.17176413536071777,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 925
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.1255179743801086e-09,
|
|
"advantages/std": 0.5228097438812256,
|
|
"advantages/var": 0.2733300282971527,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.6594982078853047,
|
|
"grad_norm": 0.11566487045829575,
|
|
"learning_rate": 1.5055117522789359e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77921983.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1593799889087677,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 926
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.6623655913978492,
|
|
"grad_norm": 0.0722680302433899,
|
|
"learning_rate": 1.5045405771351192e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 77991904.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 927
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.4937775360182155e-08,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 2.6652329749103942,
|
|
"grad_norm": 0.1048178773070161,
|
|
"learning_rate": 1.5035687631912088e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 78077359.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 928
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.757340237782477e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.6681003584229392,
|
|
"grad_norm": 0.1607884752224577,
|
|
"learning_rate": 1.50259631167762e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 78150796.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 929
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.126672697608316e-08,
|
|
"advantages/std": 0.6612905859947205,
|
|
"advantages/var": 0.4373052391252408,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.670967741935484,
|
|
"grad_norm": 0.14678061253887872,
|
|
"learning_rate": 1.5016232238255772e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 78237144.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.2301519513130188,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 930
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 8.907107673182429e-09,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.673835125448029,
|
|
"grad_norm": 0.11465268297178344,
|
|
"learning_rate": 1.5006495008671088e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 78323828.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 931
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.6767025089605734,
|
|
"grad_norm": 0.0856042022501944,
|
|
"learning_rate": 1.4996751440350477e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 78409881.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 932
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 2.6795698924731184,
|
|
"grad_norm": 0.10245524726972559,
|
|
"learning_rate": 1.4987001545630299e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 78508000.0,
|
|
"reward": 0.515625,
|
|
"reward_std": 0.1354655772447586,
|
|
"rewards/drgrpo_math_reward/mean": 0.515625,
|
|
"rewards/drgrpo_math_reward/std": 0.5017194747924805,
|
|
"step": 933
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 8.016397819821542e-09,
|
|
"advantages/std": 0.5227973461151123,
|
|
"advantages/var": 0.27331706510500453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.682437275985663,
|
|
"grad_norm": 0.09704250796406216,
|
|
"learning_rate": 1.4977245336854917e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 78579769.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 934
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383871662376608e-08,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.685304659498208,
|
|
"grad_norm": 0.11486660686377306,
|
|
"learning_rate": 1.4967482826376697e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 78651612.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 935
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.688172043010753,
|
|
"grad_norm": 0.0874091285719257,
|
|
"learning_rate": 1.495771402655597e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 78735336.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 936
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.987588013390756e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.6910394265232975,
|
|
"grad_norm": 0.10806235308978829,
|
|
"learning_rate": 1.4947938949761053e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 78800359.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 937
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.693906810035842,
|
|
"grad_norm": 0.09387755709107924,
|
|
"learning_rate": 1.4938157608368173e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 78874400.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 938
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907268126346096e-10,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.696774193548387,
|
|
"grad_norm": 0.10134195018080254,
|
|
"learning_rate": 1.492837001476153e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 78965723.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 939
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878670118891239e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.699641577060932,
|
|
"grad_norm": 0.2235023923375762,
|
|
"learning_rate": 1.491857618133321e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 79045149.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 940
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.749390312251308e-09,
|
|
"advantages/std": 0.4049657881259918,
|
|
"advantages/var": 0.1639972895525057,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.7025089605734767,
|
|
"grad_norm": 0.08963897075914033,
|
|
"learning_rate": 1.4908776120483218e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 79124361.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.094686359167099,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 941
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.252446745927492e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 2.7053763440860212,
|
|
"grad_norm": 0.11407899853476111,
|
|
"learning_rate": 1.4898969844619425e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 79220477.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 942
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899267866969296e-09,
|
|
"advantages/std": 0.4049658179283142,
|
|
"advantages/var": 0.16399731369034853,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.7082437275985662,
|
|
"grad_norm": 0.09343083442825938,
|
|
"learning_rate": 1.4889157366157595e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 79298331.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.094686359167099,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 943
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958906628562059e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.12382972314155807,
|
|
"learning_rate": 1.487933869752132e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 79378092.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 944
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.713978494623656,
|
|
"grad_norm": 0.12254767675960122,
|
|
"learning_rate": 1.4869513851142049e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 79467881.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 945
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.716845878136201,
|
|
"grad_norm": 0.31136183346146684,
|
|
"learning_rate": 1.485968283945904e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 79544328.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 946
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.7197132616487454,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.4849845674919364e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 79622291.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 947
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962814881146116e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.7225806451612904,
|
|
"grad_norm": 0.07069162132403303,
|
|
"learning_rate": 1.4840002369977878e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 79698502.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 948
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.2524704417381583e-09,
|
|
"advantages/std": 0.5726862549781799,
|
|
"advantages/var": 0.3279695466409329,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.725448028673835,
|
|
"grad_norm": 0.13405538041277784,
|
|
"learning_rate": 1.4830152937097218e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 79788916.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.16439256072044373,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 949
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.72831541218638,
|
|
"grad_norm": 0.10862978629884379,
|
|
"learning_rate": 1.4820297388747771e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 79871575.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 950
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5628747534061713e-09,
|
|
"advantages/std": 0.5227927565574646,
|
|
"advantages/var": 0.27331226630895245,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.731182795698925,
|
|
"grad_norm": 0.09485759290377778,
|
|
"learning_rate": 1.4810435737407677e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 79946601.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 951
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149940263022739e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.7340501792114695,
|
|
"grad_norm": 0.100642066311835,
|
|
"learning_rate": 1.480056799556279e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 80017444.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 952
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.736917562724014,
|
|
"grad_norm": 0.07338200894842473,
|
|
"learning_rate": 1.4790694175706695e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 80093621.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 953
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 2.739784946236559,
|
|
"grad_norm": 0.032849686700925966,
|
|
"learning_rate": 1.4780814290340649e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 80164645.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 954
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444321679928155e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.742652329749104,
|
|
"grad_norm": 0.1503840253544236,
|
|
"learning_rate": 1.4770928351973603e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 80246580.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 955
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 8.962349336466492e-09,
|
|
"advantages/std": 0.4676175117492676,
|
|
"advantages/var": 0.2186661372945764,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.7455197132616487,
|
|
"grad_norm": 0.07668240362001452,
|
|
"learning_rate": 1.476103637312217e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 80323828.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.130448117852211,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 956
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444321679928155e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 2.7483870967741937,
|
|
"grad_norm": 0.10629153913011348,
|
|
"learning_rate": 1.475113836631061e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 80410044.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 957
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379887186086637e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.7512544802867382,
|
|
"grad_norm": 0.06630528689284595,
|
|
"learning_rate": 1.474123434407081e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 80496741.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 958
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.7541218637992833,
|
|
"grad_norm": 0.10162467858428631,
|
|
"learning_rate": 1.4731324318942283e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 80571116.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 959
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0689055673126508e-08,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.756989247311828,
|
|
"grad_norm": 0.09520805259728901,
|
|
"learning_rate": 1.4721408303472131e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 80664243.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 960
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958849501312727e-10,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.759856630824373,
|
|
"grad_norm": 0.1069432224121283,
|
|
"learning_rate": 1.471148631021505e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 80742648.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 961
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494468808174133,
|
|
"advantages/var": 0.16398020040561878,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.762724014336918,
|
|
"grad_norm": 0.07304136325985482,
|
|
"learning_rate": 1.4701558351733302e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 80819619.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 962
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.7655913978494624,
|
|
"grad_norm": 0.076909777483124,
|
|
"learning_rate": 1.4691624440596696e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 80912320.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 963
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.768458781362007,
|
|
"grad_norm": 0.08451304598371802,
|
|
"learning_rate": 1.468168458938258e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 80992896.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 964
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 7.527936351672609e-10,
|
|
"advantages/std": 0.6185775995254517,
|
|
"advantages/var": 0.38263824663467005,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.771326164874552,
|
|
"grad_norm": 0.19101846437724104,
|
|
"learning_rate": 1.4671738810675836e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81078065.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.19674429297447205,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 965
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.774193548387097,
|
|
"grad_norm": 0.07835465741994906,
|
|
"learning_rate": 1.4661787117068825e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81151463.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 966
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.7770609318996415,
|
|
"grad_norm": 0.05479424124139151,
|
|
"learning_rate": 1.4651829521161424e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81234515.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 967
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.7799283154121865,
|
|
"grad_norm": 0.16588886508677797,
|
|
"learning_rate": 1.4641866035560959e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81305873.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 968
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967257841033182e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.782795698924731,
|
|
"grad_norm": 0.1158578530989859,
|
|
"learning_rate": 1.4631896672882234e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81385432.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 969
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958501673983143e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.785663082437276,
|
|
"grad_norm": 0.10413897332282132,
|
|
"learning_rate": 1.4621921445747477e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81462375.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.11336850374937057,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 970
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.404969722032547,
|
|
"advantages/var": 0.16400047576311838,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.7885304659498207,
|
|
"grad_norm": 0.08097482759759493,
|
|
"learning_rate": 1.461194036678635e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 81533072.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09916213154792786,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 971
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.691639042734874e-09,
|
|
"advantages/std": 0.5727047920227051,
|
|
"advantages/var": 0.3279907788057699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.7913978494623657,
|
|
"grad_norm": 0.14788883072954456,
|
|
"learning_rate": 1.4601953448635927e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81615351.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.18383610248565674,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 972
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907256955369e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.7942652329749103,
|
|
"grad_norm": 0.22342003375054953,
|
|
"learning_rate": 1.459196070394066e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 81693213.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 973
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450174584180815e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.7971326164874553,
|
|
"grad_norm": 0.12356303934435144,
|
|
"learning_rate": 1.4581962145352402e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81764294.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 974
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.9792225930357895e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 2.8,
|
|
"grad_norm": 0.12701479463293694,
|
|
"learning_rate": 1.457195778553034e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81849035.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.11678344756364822,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 975
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.2470159737516598e-08,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.802867383512545,
|
|
"grad_norm": 0.13362216416825526,
|
|
"learning_rate": 1.4561947637141029e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 81923931.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 976
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.50560656552406e-09,
|
|
"advantages/std": 0.6185696721076965,
|
|
"advantages/var": 0.3826284392514232,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.80573476702509,
|
|
"grad_norm": 0.11692498741580577,
|
|
"learning_rate": 1.4551931712858331e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82009690.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.18649455904960632,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 977
|
|
},
|
|
{
|
|
"advantages/mean": 6.984919309616089e-09,
|
|
"advantages/snr": 1.1291831627915751e-08,
|
|
"advantages/std": 0.6185815930366516,
|
|
"advantages/var": 0.3826431872437617,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.8086021505376344,
|
|
"grad_norm": 0.1406608507322815,
|
|
"learning_rate": 1.454191002536345e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 82088904.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.20357416570186615,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 978
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.811469534050179,
|
|
"grad_norm": 0.09124679518816567,
|
|
"learning_rate": 1.4531882587344857e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82178261.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 979
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.814336917562724,
|
|
"grad_norm": 0.06809812851547395,
|
|
"learning_rate": 1.4521849411498318e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82251057.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 980
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.817204301075269,
|
|
"grad_norm": 0.1073867350067464,
|
|
"learning_rate": 1.4511810510526867e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82329285.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 981
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-09,
|
|
"advantages/snr": 1.3278335481571027e-08,
|
|
"advantages/std": 0.7013850212097168,
|
|
"advantages/var": 0.4919409479773549,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.8200716845878135,
|
|
"grad_norm": 0.16588912014450993,
|
|
"learning_rate": 1.4501765897140778e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 82417959.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.22962789237499237,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 982
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.8229390681003586,
|
|
"grad_norm": 0.052627823166171435,
|
|
"learning_rate": 1.449171558405756e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82503360.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 983
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 9.798125641320161e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 2.825806451612903,
|
|
"grad_norm": 0.12406921859703168,
|
|
"learning_rate": 1.4481659584001946e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82592272.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 984
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.258355009955157e-09,
|
|
"advantages/std": 0.6185846924781799,
|
|
"advantages/var": 0.38264702176832444,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.828673835125448,
|
|
"grad_norm": 0.14853677123319015,
|
|
"learning_rate": 1.4471597909705855e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82690245.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.20911076664924622,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 985
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.970882751882131e-09,
|
|
"advantages/std": 0.4676063358783722,
|
|
"advantages/var": 0.21865568535359703,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.8315412186379927,
|
|
"grad_norm": 0.07254601773018031,
|
|
"learning_rate": 1.4461530573908406e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82782718.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 986
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.8344086021505377,
|
|
"grad_norm": 0.09383271684892873,
|
|
"learning_rate": 1.4451457589355872e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 82864889.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 987
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.8372759856630827,
|
|
"grad_norm": 0.14252668262964652,
|
|
"learning_rate": 1.4441378968801686e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 82943320.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 988
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.8401433691756273,
|
|
"grad_norm": 0.08642639406893778,
|
|
"learning_rate": 1.4431294725006413e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 83024227.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 989
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.974942206861495e-09,
|
|
"advantages/std": 0.46761417388916016,
|
|
"advantages/var": 0.2186630156220417,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.843010752688172,
|
|
"grad_norm": 0.08548485932771985,
|
|
"learning_rate": 1.4421204870737745e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 83114552.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.12597234547138214,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 990
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814432667740602e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.845878136200717,
|
|
"grad_norm": 0.10774296462925989,
|
|
"learning_rate": 1.4411109418770465e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 83196364.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.13888053596019745,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 991
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.848745519713262,
|
|
"grad_norm": 0.05479591059822382,
|
|
"learning_rate": 1.4401008381886457e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 83282399.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 992
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.258368719288218e-09,
|
|
"advantages/std": 0.6185809373855591,
|
|
"advantages/var": 0.38264237609679697,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.8516129032258064,
|
|
"grad_norm": 0.09920848222282774,
|
|
"learning_rate": 1.4390901772874666e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 83371633.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.19910329580307007,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 993
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.8544802867383514,
|
|
"grad_norm": 0.08555965379344638,
|
|
"learning_rate": 1.4380789604531094e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 83459958.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.12863080203533173,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 994
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.2469952164166843e-08,
|
|
"advantages/std": 0.5227973461151123,
|
|
"advantages/var": 0.27331706510500453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.857347670250896,
|
|
"grad_norm": 0.08960529128256936,
|
|
"learning_rate": 1.4370671889658782e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 83543626.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 995
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125942055767658e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.860215053763441,
|
|
"grad_norm": 0.09127515038097096,
|
|
"learning_rate": 1.4360548641067798e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 83618205.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 996
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983628920516591e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.8630824372759855,
|
|
"grad_norm": 0.11549919848538252,
|
|
"learning_rate": 1.4350419871575208e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 83691289.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 997
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.8659498207885306,
|
|
"grad_norm": 0.08600165163589032,
|
|
"learning_rate": 1.4340285594005078e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 83774046.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 998
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.868817204301075,
|
|
"grad_norm": 0.0902222187264063,
|
|
"learning_rate": 1.4330145821188434e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 83854337.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 999
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 2.87168458781362,
|
|
"grad_norm": 0.13998648226573715,
|
|
"learning_rate": 1.432000056596328e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 83933985.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.8745519713261647,
|
|
"grad_norm": 0.05841281416678034,
|
|
"learning_rate": 1.4309849841174535e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 84005555.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1001
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.130963692329808e-10,
|
|
"advantages/std": 0.5727012157440186,
|
|
"advantages/var": 0.3279866825146769,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.8774193548387097,
|
|
"grad_norm": 0.134757766435047,
|
|
"learning_rate": 1.429969365967407e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 84088581.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.18147708475589752,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1002
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983495876754114e-09,
|
|
"advantages/std": 0.4675905704498291,
|
|
"advantages/var": 0.2186409415735966,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 2.8802867383512547,
|
|
"grad_norm": 0.10776129736819205,
|
|
"learning_rate": 1.4289532034320647e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 84165395.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1003
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.906988860324416e-09,
|
|
"advantages/std": 0.5228043794631958,
|
|
"advantages/var": 0.2733244191858972,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.8831541218637993,
|
|
"grad_norm": 0.12360334360957083,
|
|
"learning_rate": 1.427936497797992e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 84253316.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.1514892876148224,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1004
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.3361080419982039e-08,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.886021505376344,
|
|
"grad_norm": 0.11279355138455799,
|
|
"learning_rate": 1.4269192503524432e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 84345585.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.888888888888889,
|
|
"grad_norm": 0.062167831208651166,
|
|
"learning_rate": 1.4259014623833576e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 84425396.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1006
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.891756272401434,
|
|
"grad_norm": 0.07904456306252947,
|
|
"learning_rate": 1.424883135179359e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 84503969.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1007
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516924590743576e-09,
|
|
"advantages/std": 0.6185553073883057,
|
|
"advantages/var": 0.3826106682982413,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.8946236559139784,
|
|
"grad_norm": 0.15491260783383373,
|
|
"learning_rate": 1.4238642700297544e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 84591781.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.16834920644760132,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1008
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.8974910394265234,
|
|
"grad_norm": 0.0905853947441121,
|
|
"learning_rate": 1.422844868224531e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 84670631.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1009
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958849501312727e-10,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.900358422939068,
|
|
"grad_norm": 0.10623597528855593,
|
|
"learning_rate": 1.4218249310543562e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 84751482.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.903225806451613,
|
|
"grad_norm": 0.08308100916550797,
|
|
"learning_rate": 1.4208044598105754e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 84838729.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1011
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.9060931899641576,
|
|
"grad_norm": 0.06735697884255029,
|
|
"learning_rate": 1.419783455785209e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 84917736.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1012
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.9089605734767026,
|
|
"grad_norm": 0.10682262758743971,
|
|
"learning_rate": 1.4187619202709536e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 84999114.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1013
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 5.6333472398789395e-09,
|
|
"advantages/std": 0.6612924933433533,
|
|
"advantages/var": 0.43730776175226893,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.9118279569892476,
|
|
"grad_norm": 0.10684262770251247,
|
|
"learning_rate": 1.4177398545611775e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 85096639.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.23356688022613525,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 1014
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016755193120049e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.914695340501792,
|
|
"grad_norm": 0.09541058678290444,
|
|
"learning_rate": 1.4167172599499207e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 85176369.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344456541825744e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.9175627240143367,
|
|
"grad_norm": 0.09885274980236455,
|
|
"learning_rate": 1.4156941377318927e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 85255259.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1016
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504875212414157e-09,
|
|
"advantages/std": 0.5726920366287231,
|
|
"advantages/var": 0.32797616881795477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.9204301075268817,
|
|
"grad_norm": 0.13566379630410266,
|
|
"learning_rate": 1.4146704892024711e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 85335500.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1701665222644806,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1017
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.97930573908082e-09,
|
|
"advantages/std": 0.4675965905189514,
|
|
"advantages/var": 0.21864657146494793,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 2.9232974910394267,
|
|
"grad_norm": 0.08449164175546724,
|
|
"learning_rate": 1.4136463156577004e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 85420546.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1018
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.9261648745519713,
|
|
"grad_norm": 0.110435170845307,
|
|
"learning_rate": 1.4126216183942886e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 85501354.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1019
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749358156051495e-09,
|
|
"advantages/std": 0.4049680531024933,
|
|
"advantages/var": 0.16399912403362382,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.9290322580645163,
|
|
"grad_norm": 0.09421270821046172,
|
|
"learning_rate": 1.4115963987096078e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 85593918.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.09704046696424484,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958294131658464e-10,
|
|
"advantages/std": 0.46761149168014526,
|
|
"advantages/var": 0.21866050715133056,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.931899641577061,
|
|
"grad_norm": 0.0775638917687058,
|
|
"learning_rate": 1.4105706579016914e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 85687755.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.12255740165710449,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1021
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2946288161655503e-08,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.934767025089606,
|
|
"grad_norm": 0.11807849656225168,
|
|
"learning_rate": 1.409544397269232e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 85761830.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1022
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344282883117038e-09,
|
|
"advantages/std": 0.5227956175804138,
|
|
"advantages/var": 0.2733152577612863,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.9376344086021504,
|
|
"grad_norm": 0.11236698481614323,
|
|
"learning_rate": 1.408517618111581e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 85849256.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.14230036735534668,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1023
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.264868475059852e-08,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.9405017921146954,
|
|
"grad_norm": 0.1800267764243477,
|
|
"learning_rate": 1.407490321728746e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 85934211.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1024
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 2.94336917562724,
|
|
"grad_norm": 0.07265991277831217,
|
|
"learning_rate": 1.4064625094213898e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86003296.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.946236559139785,
|
|
"grad_norm": 0.13200183927295472,
|
|
"learning_rate": 1.405434182490828e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 86073842.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1026
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 9.958667967959763e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 2.9491039426523296,
|
|
"grad_norm": 0.11495201119172228,
|
|
"learning_rate": 1.4044053422390278e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86154117.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1027
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516696093096275e-09,
|
|
"advantages/std": 0.6185865998268127,
|
|
"advantages/var": 0.38264938148529737,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 2.9519713261648746,
|
|
"grad_norm": 0.10893709159700979,
|
|
"learning_rate": 1.4033759899686061e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86240235.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.20911568403244019,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1028
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4676129221916199,
|
|
"advantages/var": 0.21866184500058594,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.9548387096774196,
|
|
"grad_norm": 0.09579100145553239,
|
|
"learning_rate": 1.4023461269828296e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86325908.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.1246790662407875,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1029
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983629174425397e-09,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.957706093189964,
|
|
"grad_norm": 0.08513116523147185,
|
|
"learning_rate": 1.401315754585609e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86414814.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262171105692396e-09,
|
|
"advantages/std": 0.5726926326751709,
|
|
"advantages/var": 0.3279768515204182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 2.9605734767025087,
|
|
"grad_norm": 0.1262348988136174,
|
|
"learning_rate": 1.400284874081502e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 86503399.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.1712273508310318,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1031
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.9634408602150537,
|
|
"grad_norm": 0.08103645772255394,
|
|
"learning_rate": 1.3992534867757089e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86564084.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1032
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.9663082437275987,
|
|
"grad_norm": 0.0849378340052373,
|
|
"learning_rate": 1.3982215939740725e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86648887.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 1033
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.463608042839594e-08,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.9691756272401433,
|
|
"grad_norm": 0.281596752146975,
|
|
"learning_rate": 1.3971891969830733e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86729674.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1034
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.9720430107526883,
|
|
"grad_norm": 0.11168309540429934,
|
|
"learning_rate": 1.3961562971098335e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 86811377.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185722351074219,
|
|
"advantages/var": 0.3826316100457916,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 2.974910394265233,
|
|
"grad_norm": 0.14349750402415082,
|
|
"learning_rate": 1.395122895662109e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86898169.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.19097033143043518,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1036
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 2.977777777777778,
|
|
"grad_norm": 0.0785080644075257,
|
|
"learning_rate": 1.3940889939482923e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 86978019.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1037
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.9806451612903224,
|
|
"grad_norm": 0.1306098130087868,
|
|
"learning_rate": 1.3930545932774092e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 87049853.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1038
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.0349153895649778e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.9835125448028674,
|
|
"grad_norm": 0.09788399616171604,
|
|
"learning_rate": 1.3920196949591166e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 87135503.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1039
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.131077936456911e-10,
|
|
"advantages/std": 0.5726931691169739,
|
|
"advantages/var": 0.32797746595324284,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 2.9863799283154124,
|
|
"grad_norm": 0.1527075830682524,
|
|
"learning_rate": 1.390984300303702e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 87222915.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.172288179397583,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.989247311827957,
|
|
"grad_norm": 0.046094532672897426,
|
|
"learning_rate": 1.3899484106220814e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 87297903.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1041
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 2.9921146953405016,
|
|
"grad_norm": 0.0983977964273471,
|
|
"learning_rate": 1.388912027225797e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 87371974.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1042
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998806953079044e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 2.9949820788530466,
|
|
"grad_norm": 0.10504310759419465,
|
|
"learning_rate": 1.3878751514270169e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 87445745.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1043
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 2.9978494623655916,
|
|
"grad_norm": 0.08956076137213259,
|
|
"learning_rate": 1.3868377845385317e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 87531820.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1044
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46760615706443787,
|
|
"advantages/var": 0.21865551812457173,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.002867383512545,
|
|
"grad_norm": 0.11278050956106807,
|
|
"learning_rate": 1.3857999278737545e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 87611902.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.0057347670250896,
|
|
"grad_norm": 0.07363958021084817,
|
|
"learning_rate": 1.384761582746718e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 87678237.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1046
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 3.5209261236260495e-09,
|
|
"advantages/std": 0.6612766981124878,
|
|
"advantages/var": 0.4372868714665543,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.0086021505376346,
|
|
"grad_norm": 0.15511405904834488,
|
|
"learning_rate": 1.3837227504720736e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 87766887.0,
|
|
"reward": 0.578125,
|
|
"reward_std": 0.2109457552433014,
|
|
"rewards/drgrpo_math_reward/mean": 0.578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4957992732524872,
|
|
"step": 1047
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726862549781799,
|
|
"advantages/var": 0.3279695466409329,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.011469534050179,
|
|
"grad_norm": 0.14518322064472808,
|
|
"learning_rate": 1.3826834323650898e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 87846837.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.16439256072044373,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1048
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227956175804138,
|
|
"advantages/var": 0.2733152577612863,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.014336917562724,
|
|
"grad_norm": 0.09863025981033159,
|
|
"learning_rate": 1.3816436297416494e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 87928695.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.14230038225650787,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1049
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.0172043010752687,
|
|
"grad_norm": 0.10991296190658557,
|
|
"learning_rate": 1.3806033439182497e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 88007220.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.0200716845878137,
|
|
"grad_norm": 0.08240359629638405,
|
|
"learning_rate": 1.3795625762119985e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 88091728.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1051
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.0229390681003583,
|
|
"grad_norm": 0.08775335038983857,
|
|
"learning_rate": 1.3785213279406146e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 88167242.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1052
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.0258064516129033,
|
|
"grad_norm": 0.12907689906371417,
|
|
"learning_rate": 1.3774796004224256e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 88234112.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1053
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.028673835125448,
|
|
"grad_norm": 0.09909875372545414,
|
|
"learning_rate": 1.3764373949763645e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 88304660.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1054
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262188031035393e-09,
|
|
"advantages/std": 0.5726920366287231,
|
|
"advantages/var": 0.32797616881795477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.031541218637993,
|
|
"grad_norm": 0.1472668433116852,
|
|
"learning_rate": 1.375394712921971e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 88400732.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1701665222644806,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958574030221743e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.0344086021505374,
|
|
"grad_norm": 0.11660139742796921,
|
|
"learning_rate": 1.374351555579387e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 88469780.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1056
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983533706996105e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.0372759856630824,
|
|
"grad_norm": 0.10045295452108793,
|
|
"learning_rate": 1.3733079242693571e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 88540617.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1057
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.0401433691756274,
|
|
"grad_norm": 0.10774423597192603,
|
|
"learning_rate": 1.372263820313225e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 88619796.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1058
|
|
},
|
|
{
|
|
"advantages/mean": 5.587935447692871e-09,
|
|
"advantages/snr": 1.379866977655094e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.043010752688172,
|
|
"grad_norm": 0.14459464886811796,
|
|
"learning_rate": 1.3712192450329336e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 88704086.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1059
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.453528449034464e-09,
|
|
"advantages/std": 0.5228003859519958,
|
|
"advantages/var": 0.2733202435515558,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.045878136200717,
|
|
"grad_norm": 0.20995250081848818,
|
|
"learning_rate": 1.3701741997510221e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 88782665.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.14913025498390198,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.96706741399221e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.0487455197132616,
|
|
"grad_norm": 0.12442638943183558,
|
|
"learning_rate": 1.3691286857906251e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 88854072.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1061
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.0112256078484365e-09,
|
|
"advantages/std": 0.6185671091079712,
|
|
"advantages/var": 0.38262526847019274,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.0516129032258066,
|
|
"grad_norm": 0.14621710227022958,
|
|
"learning_rate": 1.3680827044754707e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 88945302.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.18201878666877747,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1062
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.453534034272115e-09,
|
|
"advantages/std": 0.5227997303009033,
|
|
"advantages/var": 0.27331955800269725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.054480286738351,
|
|
"grad_norm": 0.10101354256707841,
|
|
"learning_rate": 1.367036257129878e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 89035504.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.14806944131851196,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1063
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.057347670250896,
|
|
"grad_norm": 0.09216983462994441,
|
|
"learning_rate": 1.3659893450787573e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 89116768.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1064
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876548503938182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.0602150537634407,
|
|
"grad_norm": 0.07296259186673848,
|
|
"learning_rate": 1.3649419696476055e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 89193429.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.0630824372759857,
|
|
"grad_norm": 0.1040375514130535,
|
|
"learning_rate": 1.3638941321625084e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 89261258.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1066
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.0659498207885303,
|
|
"grad_norm": 0.09775719772800988,
|
|
"learning_rate": 1.3628458339501347e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 89356079.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1067
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 3.0688172043010753,
|
|
"grad_norm": 0.09456317741154854,
|
|
"learning_rate": 1.3617970763377383e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 89453113.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1068
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.453592933994712e-09,
|
|
"advantages/std": 0.5227928161621094,
|
|
"advantages/var": 0.2733123286307091,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.07168458781362,
|
|
"grad_norm": 0.10740696477457821,
|
|
"learning_rate": 1.360747860653153e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 89537226.0,
|
|
"reward": 0.5390625,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.5390625,
|
|
"rewards/drgrpo_math_reward/std": 0.5004304051399231,
|
|
"step": 1069
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.074551971326165,
|
|
"grad_norm": 0.10686438889895704,
|
|
"learning_rate": 1.3596981882247942e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 89619330.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.07741935483871,
|
|
"grad_norm": 0.09464056194453326,
|
|
"learning_rate": 1.3586480603816543e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 89704066.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 1071
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.0802867383512544,
|
|
"grad_norm": 0.18192909878470326,
|
|
"learning_rate": 1.3575974784533031e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 89778195.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1072
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9750579720916185e-09,
|
|
"advantages/std": 0.4676051139831543,
|
|
"advantages/var": 0.21865454262319872,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.0831541218637994,
|
|
"grad_norm": 0.1056231633941552,
|
|
"learning_rate": 1.3565464437698848e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 89856346.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.11784426867961884,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1073
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299880526045478e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.086021505376344,
|
|
"grad_norm": 0.12922868766375104,
|
|
"learning_rate": 1.355494957662117e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 89940852.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1074
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.088888888888889,
|
|
"grad_norm": 0.15690728990169092,
|
|
"learning_rate": 1.3544430214612895e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90018486.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975244195968941e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.0917562724014336,
|
|
"grad_norm": 0.14509465934671859,
|
|
"learning_rate": 1.3533906364992604e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90090651.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 1076
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966859224177393e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.0946236559139786,
|
|
"grad_norm": 0.11225193068731186,
|
|
"learning_rate": 1.3523378041084574e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90171019.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1077
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.097491039426523,
|
|
"grad_norm": 0.08430816486137278,
|
|
"learning_rate": 1.3512845256218746e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90244863.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1078
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5056365973668555e-09,
|
|
"advantages/std": 0.618557333946228,
|
|
"advantages/var": 0.38261317537866546,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.100358422939068,
|
|
"grad_norm": 0.13897323270534442,
|
|
"learning_rate": 1.35023080237307e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90322473.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.17176413536071777,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1079
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185835003852844,
|
|
"advantages/var": 0.3826455469489112,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.1032258064516127,
|
|
"grad_norm": 0.1296860171128417,
|
|
"learning_rate": 1.3491766356961658e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 90413848.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.20357906818389893,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516924590743576e-09,
|
|
"advantages/std": 0.6185553073883057,
|
|
"advantages/var": 0.3826106682982413,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.1060931899641577,
|
|
"grad_norm": 0.09957232647714506,
|
|
"learning_rate": 1.3481220269258446e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90508557.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.16834920644760132,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1081
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.975176026781512e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.1089605734767023,
|
|
"grad_norm": 0.11632736235577498,
|
|
"learning_rate": 1.3470669773973495e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 90595707.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1082
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.1118279569892473,
|
|
"grad_norm": 0.08457504067488755,
|
|
"learning_rate": 1.3460114884464813e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90658904.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1083
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344385248990318e-09,
|
|
"advantages/std": 0.5227856040000916,
|
|
"advantages/var": 0.27330478774974054,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.1146953405017923,
|
|
"grad_norm": 0.13503806139774452,
|
|
"learning_rate": 1.3449555614095968e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90732573.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.13098980486392975,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1084
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.117562724014337,
|
|
"grad_norm": 0.08829586881255871,
|
|
"learning_rate": 1.3438991976236084e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90806880.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.120430107526882,
|
|
"grad_norm": 0.1341238536550148,
|
|
"learning_rate": 1.342842398425981e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 90881116.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1086
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954289704678737e-08,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.1232974910394264,
|
|
"grad_norm": 0.09064624961768537,
|
|
"learning_rate": 1.3417851651547306e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 90971531.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.11678344756364822,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1087
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907386946654693e-10,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.1261648745519715,
|
|
"grad_norm": 0.09238775805363784,
|
|
"learning_rate": 1.3407274991484222e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 91064414.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1088
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967079601050182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.129032258064516,
|
|
"grad_norm": 0.09329067748840777,
|
|
"learning_rate": 1.3396694017461707e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 91148087.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 1089
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599521727490371e-09,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.131899641577061,
|
|
"grad_norm": 0.10497977844609316,
|
|
"learning_rate": 1.3386108742876349e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 91211425.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.6262555319655851e-09,
|
|
"advantages/std": 0.5726791024208069,
|
|
"advantages/var": 0.327961354349501,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.1347670250896056,
|
|
"grad_norm": 0.1294893154667107,
|
|
"learning_rate": 1.3375519181130192e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 91290971.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.15308690071105957,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 1091
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.1376344086021506,
|
|
"grad_norm": 0.07557647834198548,
|
|
"learning_rate": 1.3364925345630711e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 91375027.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1092
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.8788021410185465e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.140501792114695,
|
|
"grad_norm": 0.1400886797653509,
|
|
"learning_rate": 1.3354327249790785e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 91462378.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1093
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.527881223669866e-09,
|
|
"advantages/std": 0.6185821294784546,
|
|
"advantages/var": 0.38264385091009956,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.14336917562724,
|
|
"grad_norm": 0.12259425546705846,
|
|
"learning_rate": 1.334372490702869e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 91549766.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.20463500916957855,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1094
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.146236559139785,
|
|
"grad_norm": 0.04587102497418084,
|
|
"learning_rate": 1.3333118330768082e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 91642091.0,
|
|
"reward": 0.421875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.421875,
|
|
"rewards/drgrpo_math_reward/std": 0.4957992732524872,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.3278481048164687e-09,
|
|
"advantages/std": 0.7013773322105408,
|
|
"advantages/var": 0.4919301621387753,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.1491039426523297,
|
|
"grad_norm": 0.16406640602715863,
|
|
"learning_rate": 1.3322507534437963e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 91731679.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.2214949131011963,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1096
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.1519713261648747,
|
|
"grad_norm": 0.12206538089462148,
|
|
"learning_rate": 1.3311892531472704e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 91804150.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1097
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983311559814009e-09,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.1548387096774193,
|
|
"grad_norm": 0.09001265590862512,
|
|
"learning_rate": 1.3301273335311976e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 91888615.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1236182302236557,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1098
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199093551714649e-09,
|
|
"advantages/std": 0.4049627482891083,
|
|
"advantages/var": 0.16399482750186767,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.1577060931899643,
|
|
"grad_norm": 0.19512990430006444,
|
|
"learning_rate": 1.3290649959400775e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 91964406.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1099
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966888674758758e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.160573476702509,
|
|
"grad_norm": 0.08814638579810388,
|
|
"learning_rate": 1.328002241718938e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 92050716.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.163440860215054,
|
|
"grad_norm": 0.09837497675644805,
|
|
"learning_rate": 1.3269390722133356e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 92125083.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1101
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.1663082437275984,
|
|
"grad_norm": 0.04586394637505574,
|
|
"learning_rate": 1.325875488769351e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 92198984.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1102
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 9.199298683221743e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.1691756272401435,
|
|
"grad_norm": 0.06400873024854346,
|
|
"learning_rate": 1.3248114927335906e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 92275691.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1103
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.1292059034235823e-08,
|
|
"advantages/std": 0.6185691356658936,
|
|
"advantages/var": 0.3826277755984506,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.172043010752688,
|
|
"grad_norm": 0.1423659523351042,
|
|
"learning_rate": 1.3237470854531823e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 92359565.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.18543371558189392,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1104
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907386946654693e-10,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.174910394265233,
|
|
"grad_norm": 0.13585745299902,
|
|
"learning_rate": 1.3226822682757743e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 92437625.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983629174425397e-09,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.1777777777777776,
|
|
"grad_norm": 0.11995484554528156,
|
|
"learning_rate": 1.321617042549535e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 92511972.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1106
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.95844518607158e-10,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.1806451612903226,
|
|
"grad_norm": 0.14138202171271355,
|
|
"learning_rate": 1.320551409623149e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 92591448.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1107
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.4083929189798527e-09,
|
|
"advantages/std": 0.6612661480903625,
|
|
"advantages/var": 0.4372729186102653,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.183512544802867,
|
|
"grad_norm": 17.339898670927273,
|
|
"learning_rate": 1.319485370845817e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 92675998.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.19833700358867645,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1108
|
|
},
|
|
{
|
|
"advantages/mean": 6.51925802230835e-09,
|
|
"advantages/snr": 1.3941923669503344e-08,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.186379928315412,
|
|
"grad_norm": 0.11099719938959011,
|
|
"learning_rate": 1.318418927567253e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 92755810.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1109
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299811976811062e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.189247311827957,
|
|
"grad_norm": 0.06748169995525603,
|
|
"learning_rate": 1.3173520811376842e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 92845212.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2946288161655503e-08,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.1921146953405017,
|
|
"grad_norm": 0.1729202957432041,
|
|
"learning_rate": 1.3162848329078468e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 92931747.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1111
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125764130418491e-09,
|
|
"advantages/std": 0.5227916836738586,
|
|
"advantages/var": 0.2733111445185479,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 3.1949820788530467,
|
|
"grad_norm": 0.11420128502253798,
|
|
"learning_rate": 1.3152171842289869e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 93015564.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.13994136452674866,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1112
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.1978494623655913,
|
|
"grad_norm": 0.10964878776919543,
|
|
"learning_rate": 1.3141491364528575e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93105876.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1113
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.2007168458781363,
|
|
"grad_norm": 0.09936713674235224,
|
|
"learning_rate": 1.3130806909317155e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93178808.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1114
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.203584229390681,
|
|
"grad_norm": 0.08818255744001925,
|
|
"learning_rate": 1.3120118490183236e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 93260861.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.206451612903226,
|
|
"grad_norm": 0.1029823451744844,
|
|
"learning_rate": 1.310942612065945e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93339232.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1116
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.2093189964157705,
|
|
"grad_norm": 0.10239693579696584,
|
|
"learning_rate": 1.3098729814283425e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93414681.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1117
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344503462080032e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.2121863799283155,
|
|
"grad_norm": 0.11581062256087042,
|
|
"learning_rate": 1.308802958459779e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93495167.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1118
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814536252692192e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.21505376344086,
|
|
"grad_norm": 0.14209019240601692,
|
|
"learning_rate": 1.307732544515013e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 93573944.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1119
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.217921146953405,
|
|
"grad_norm": 0.0678371309109891,
|
|
"learning_rate": 1.3066617409492982e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93644153.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.22078853046595,
|
|
"grad_norm": 0.11297932577816154,
|
|
"learning_rate": 1.3055905491183821e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93719671.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1121
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907427569709573e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.2236559139784946,
|
|
"grad_norm": 0.14507820856849132,
|
|
"learning_rate": 1.3045189703785023e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 93804920.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1122
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.344496759136123e-09,
|
|
"advantages/std": 0.5227746963500977,
|
|
"advantages/var": 0.2732933831439368,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.2265232974910396,
|
|
"grad_norm": 0.08312605561045118,
|
|
"learning_rate": 1.3034470060863888e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93876784.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.12179599702358246,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1123
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.229390681003584,
|
|
"grad_norm": 0.10769709358908111,
|
|
"learning_rate": 1.302374657599257e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 93949699.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1124
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.232258064516129,
|
|
"grad_norm": 0.05790644165104894,
|
|
"learning_rate": 1.301301926274811e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 94019155.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299802498719973e-09,
|
|
"advantages/std": 0.4049576222896576,
|
|
"advantages/var": 0.16399067585049298,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.2351254480286737,
|
|
"grad_norm": 0.1089893979447905,
|
|
"learning_rate": 1.300228813471238e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 94096690.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.08891239762306213,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1126
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.2379928315412188,
|
|
"grad_norm": 0.05358052660575321,
|
|
"learning_rate": 1.299155320547209e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 94172287.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1127
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599694025151775e-09,
|
|
"advantages/std": 0.40494978427886963,
|
|
"advantages/var": 0.16398432778750305,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.2408602150537633,
|
|
"grad_norm": 0.08993277827412993,
|
|
"learning_rate": 1.2980814488618763e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 94254562.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1128
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.2437275985663083,
|
|
"grad_norm": 0.03741420481122981,
|
|
"learning_rate": 1.297007199774871e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 94322524.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1129
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 3.246594982078853,
|
|
"grad_norm": 0.09576357623917453,
|
|
"learning_rate": 1.2959325746463035e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 94402604.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.449667444137735e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 3.249462365591398,
|
|
"grad_norm": 0.05155173182885348,
|
|
"learning_rate": 1.2948575748367584e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 94489118.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1131
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9834009234729675e-09,
|
|
"advantages/std": 0.4676017165184021,
|
|
"advantages/var": 0.21865136529095608,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.252329749103943,
|
|
"grad_norm": 0.11833142962661158,
|
|
"learning_rate": 1.2937822017072964e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 94565852.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1132
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.2551971326164875,
|
|
"grad_norm": 0.1250304564086963,
|
|
"learning_rate": 1.2927064566194492e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 94645859.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1133
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.258064516129032,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.2916303409352214e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 94732887.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1134
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.260931899641577,
|
|
"grad_norm": 0.0983279558017045,
|
|
"learning_rate": 1.2905538560170852e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 94814521.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907665222004876e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.263799283154122,
|
|
"grad_norm": 0.10809583758934845,
|
|
"learning_rate": 1.289477003227981e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 94906213.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1136
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.2666666666666666,
|
|
"grad_norm": 0.0384556429318533,
|
|
"learning_rate": 1.288399783931315e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 94978971.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1137
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983371727518391e-09,
|
|
"advantages/std": 0.4676051437854767,
|
|
"advantages/var": 0.21865457049463632,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.2695340501792116,
|
|
"grad_norm": 0.12000069202772719,
|
|
"learning_rate": 1.287322199490957e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 95062894.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.11784426867961884,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1138
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958849501312727e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.272401433691756,
|
|
"grad_norm": 0.11544441417144902,
|
|
"learning_rate": 1.2862442512712392e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95142033.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1139
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998806953079044e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.275268817204301,
|
|
"grad_norm": 0.12380391127673529,
|
|
"learning_rate": 1.2851659406369551e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95213204.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.2781362007168457,
|
|
"grad_norm": 0.16194218053487433,
|
|
"learning_rate": 1.284087268953356e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95295811.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1141
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.2810035842293908,
|
|
"grad_norm": 0.05095881919374362,
|
|
"learning_rate": 1.2830082375861512e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 95373379.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1142
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5228040218353271,
|
|
"advantages/var": 0.2733240452471932,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.2838709677419353,
|
|
"grad_norm": 0.12303248397130558,
|
|
"learning_rate": 1.2819288479015047e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95462214.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.15466687083244324,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1143
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962665787340388e-09,
|
|
"advantages/std": 0.4676010012626648,
|
|
"advantages/var": 0.21865069638184664,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.2867383512544803,
|
|
"grad_norm": 0.09910456100563501,
|
|
"learning_rate": 1.280849101266035e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95539641.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1144
|
|
},
|
|
{
|
|
"advantages/mean": 5.587935447692871e-09,
|
|
"advantages/snr": 1.1950152502444473e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.289605734767025,
|
|
"grad_norm": 0.08736872419534301,
|
|
"learning_rate": 1.2797689990468112e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 95621771.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.29247311827957,
|
|
"grad_norm": 0.07349690490924235,
|
|
"learning_rate": 1.2786885426113544e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95708411.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1146
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.295340501792115,
|
|
"grad_norm": 0.09362213075565909,
|
|
"learning_rate": 1.2776077333276324e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95787152.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 1147
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.2982078853046595,
|
|
"grad_norm": 0.03775176697675935,
|
|
"learning_rate": 1.276526572564061e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95871995.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 1148
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.022521435078091e-09,
|
|
"advantages/std": 0.6185598969459534,
|
|
"advantages/var": 0.38261634610978845,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.3010752688172045,
|
|
"grad_norm": 0.17528959621414597,
|
|
"learning_rate": 1.2754450616895005e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 95954564.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.17623992264270782,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1149
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049657881259918,
|
|
"advantages/var": 0.1639972895525057,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.303942652329749,
|
|
"grad_norm": 0.05405794177356671,
|
|
"learning_rate": 1.2743632020732548e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 96041684.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.0946863517165184,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.306810035842294,
|
|
"grad_norm": 0.0745366325775834,
|
|
"learning_rate": 1.2732809950850683e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 96114505.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1151
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.3096774193548386,
|
|
"grad_norm": 0.10078408790459324,
|
|
"learning_rate": 1.2721984420951268e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 96190659.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1152
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.3125448028673836,
|
|
"grad_norm": 0.07386438095189551,
|
|
"learning_rate": 1.2711155444740526e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 96261026.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1153
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.315412186379928,
|
|
"grad_norm": 0.04510258925922734,
|
|
"learning_rate": 1.2700323035929062e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 96337551.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1154
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.318279569892473,
|
|
"grad_norm": 0.05165231576261721,
|
|
"learning_rate": 1.2689487208231805e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 96414069.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562938529588136e-09,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.3211469534050178,
|
|
"grad_norm": 0.1420159424855486,
|
|
"learning_rate": 1.267864797536803e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 96490338.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1156
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958683201273463e-10,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.3240143369175628,
|
|
"grad_norm": 0.12135563806747647,
|
|
"learning_rate": 1.2667805351061312e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 96570241.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 1157
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1498692618242365e-08,
|
|
"advantages/std": 0.40496888756752014,
|
|
"advantages/var": 0.16399979989767477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.3268817204301078,
|
|
"grad_norm": 0.10155317067010135,
|
|
"learning_rate": 1.265695934903953e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 96654085.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.09810129553079605,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1158
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.3297491039426523,
|
|
"grad_norm": 0.057359457438611275,
|
|
"learning_rate": 1.2646109983034832e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 96736103.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1159
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.332616487455197,
|
|
"grad_norm": 0.10192389899377412,
|
|
"learning_rate": 1.263525726678363e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 96826847.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.335483870967742,
|
|
"grad_norm": 0.09375847790139395,
|
|
"learning_rate": 1.2624401214026572e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 96904825.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1161
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.75738695226396e-09,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.338351254480287,
|
|
"grad_norm": 0.12074444773595139,
|
|
"learning_rate": 1.2613541838508535e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 96982233.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1162
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504965933612274e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.3412186379928315,
|
|
"grad_norm": 0.10123460756262612,
|
|
"learning_rate": 1.26026791539786e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 97068903.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1163
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.3440860215053765,
|
|
"grad_norm": 0.06230410181062095,
|
|
"learning_rate": 1.2591813174190044e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 97154759.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1164
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.346953405017921,
|
|
"grad_norm": 0.06771623444997626,
|
|
"learning_rate": 1.2580943912900308e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 97228764.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262475767256781e-09,
|
|
"advantages/std": 0.5726819038391113,
|
|
"advantages/var": 0.32796456298478915,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.349820788530466,
|
|
"grad_norm": 0.1288512683886992,
|
|
"learning_rate": 1.2570071383870988e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 97324635.0,
|
|
"reward": 0.5859375,
|
|
"reward_std": 0.15756267309188843,
|
|
"rewards/drgrpo_math_reward/mean": 0.5859375,
|
|
"rewards/drgrpo_math_reward/std": 0.49449479579925537,
|
|
"step": 1166
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185553073883057,
|
|
"advantages/var": 0.3826106682982413,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.3526881720430106,
|
|
"grad_norm": 0.1359847136820415,
|
|
"learning_rate": 1.255919560086783e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 97405487.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.16834917664527893,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1167
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983628920516591e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.3555555555555556,
|
|
"grad_norm": 0.08508583268728266,
|
|
"learning_rate": 1.2548316577660676e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 97478621.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1168
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.8788747562343056e-09,
|
|
"advantages/std": 0.5726664066314697,
|
|
"advantages/var": 0.32794681328419983,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.3584229390681,
|
|
"grad_norm": 0.13029888196956377,
|
|
"learning_rate": 1.25374343280235e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 97555214.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.13941732048988342,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1169
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.361290322580645,
|
|
"grad_norm": 0.14925387660351838,
|
|
"learning_rate": 1.2526548865734334e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 97644243.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.3641577060931898,
|
|
"grad_norm": 0.16881070944340243,
|
|
"learning_rate": 1.2515660204575295e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 97713405.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1171
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.3670250896057348,
|
|
"grad_norm": 0.07447657569529018,
|
|
"learning_rate": 1.2504768358332543e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 97794436.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1172
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.3698924731182798,
|
|
"grad_norm": 0.11325414232721744,
|
|
"learning_rate": 1.249387334079627e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 97877806.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1173
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967257841033182e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.3727598566308243,
|
|
"grad_norm": 0.09120860078900074,
|
|
"learning_rate": 1.2482975165760687e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 97955036.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1174
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343596286746e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.3756272401433693,
|
|
"grad_norm": 0.10773030424525183,
|
|
"learning_rate": 1.2472073847024004e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 98022372.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344456541825744e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.378494623655914,
|
|
"grad_norm": 0.15865280731250242,
|
|
"learning_rate": 1.24611693983884e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98095593.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1176
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.381362007168459,
|
|
"grad_norm": 0.08123097491556498,
|
|
"learning_rate": 1.245026183366003e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98170363.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1177
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.8788209293635885e-09,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.3842293906810035,
|
|
"grad_norm": 0.11612838837917279,
|
|
"learning_rate": 1.2439351166648992e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 98258497.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.1462521106004715,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1178
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.3870967741935485,
|
|
"grad_norm": 0.13617074164091159,
|
|
"learning_rate": 1.2428437411169303e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 98343751.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1179
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.389964157706093,
|
|
"grad_norm": 0.07805308280038463,
|
|
"learning_rate": 1.2417520581038901e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98420827.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983628920516591e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.392831541218638,
|
|
"grad_norm": 0.12308896762162003,
|
|
"learning_rate": 1.2406600690079608e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98503170.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1181
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.3956989247311826,
|
|
"grad_norm": 0.08901455185337721,
|
|
"learning_rate": 1.2395677752117126e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98582303.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1182
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.131055087374661e-10,
|
|
"advantages/std": 0.5726947784423828,
|
|
"advantages/var": 0.32797930925516994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.3985663082437276,
|
|
"grad_norm": 0.1257490696194946,
|
|
"learning_rate": 1.2384751780981017e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 98670905.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.17464229464530945,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1183
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.5050695213580615e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.4014336917562726,
|
|
"grad_norm": 0.1286335029069769,
|
|
"learning_rate": 1.2373822790504681e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98755275.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1184
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049627184867859,
|
|
"advantages/var": 0.1639948033642078,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.404301075268817,
|
|
"grad_norm": 0.08057067854185818,
|
|
"learning_rate": 1.236289079452534e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98838254.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.4071684587813618,
|
|
"grad_norm": 0.08605115730986439,
|
|
"learning_rate": 1.2351955806884014e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98914452.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1186
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.4100358422939068,
|
|
"grad_norm": 0.026348135341604267,
|
|
"learning_rate": 1.234101784142553e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 98990483.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1187
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5629027821475993e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.412903225806452,
|
|
"grad_norm": 0.07819569401189805,
|
|
"learning_rate": 1.2330076911998463e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 99083265.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1188
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.4157706093189963,
|
|
"grad_norm": 0.09006369062795917,
|
|
"learning_rate": 1.2319133032455162e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 99155919.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1189
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958683201273463e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.4186379928315414,
|
|
"grad_norm": 0.10137794543828527,
|
|
"learning_rate": 1.230818621665169e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 99241101.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.421505376344086,
|
|
"grad_norm": 0.04979362374814982,
|
|
"learning_rate": 1.2297236478447845e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 99325116.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1191
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950488391937882e-08,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.424372759856631,
|
|
"grad_norm": 0.11878324664587483,
|
|
"learning_rate": 1.2286283831707114e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 99396918.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1192
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7815011540266774e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.4272401433691755,
|
|
"grad_norm": 0.12176590117312114,
|
|
"learning_rate": 1.2275328290296676e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 99477109.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1193
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899627360122966e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.4301075268817205,
|
|
"grad_norm": 0.12056498969551692,
|
|
"learning_rate": 1.2264369868087364e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 99551171.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1194
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.1498780624502616e-09,
|
|
"advantages/std": 0.4049657881259918,
|
|
"advantages/var": 0.1639972895525057,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.432974910394265,
|
|
"grad_norm": 0.12740750311776367,
|
|
"learning_rate": 1.2253408578953666e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 99630765.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.094686359167099,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199247907244247e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.43584229390681,
|
|
"grad_norm": 0.08831406153890661,
|
|
"learning_rate": 1.2242444436773695e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 99711000.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1196
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.1499403476539522e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.4387096774193546,
|
|
"grad_norm": 11.03990611559521,
|
|
"learning_rate": 1.2231477455429185e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 99784870.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1197
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016541313711486e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 3.4415770609318996,
|
|
"grad_norm": 0.10249676734220825,
|
|
"learning_rate": 1.2220507648805454e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 99867366.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1198
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907386946654693e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.4444444444444446,
|
|
"grad_norm": 0.14083166742954242,
|
|
"learning_rate": 1.2209535030791402e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 99957869.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1199
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562923093105361e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.447311827956989,
|
|
"grad_norm": 0.1092072729801531,
|
|
"learning_rate": 1.219855961527949e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 100034471.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726664066314697,
|
|
"advantages/var": 0.32794681328419983,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.450179211469534,
|
|
"grad_norm": 0.09284326648205744,
|
|
"learning_rate": 1.218758141616572e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 100119571.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.13941732048988342,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1201
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958516906788102e-10,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.4530465949820788,
|
|
"grad_norm": 0.09633772102145596,
|
|
"learning_rate": 1.2176600447349615e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 100201105.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1202
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.455913978494624,
|
|
"grad_norm": 0.06799586748471112,
|
|
"learning_rate": 1.216561672273421e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 100291427.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1203
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5629027821475993e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.4587813620071683,
|
|
"grad_norm": 0.12981349922544916,
|
|
"learning_rate": 1.2154630256226021e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 100375437.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1204
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.3009849269685278e-08,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.4616487455197134,
|
|
"grad_norm": 0.14342295542203048,
|
|
"learning_rate": 1.2143641061735045e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 100463182.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.464516129032258,
|
|
"grad_norm": 0.11517897259066424,
|
|
"learning_rate": 1.2132649153174732e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 100549042.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1206
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96693437436781e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.467383512544803,
|
|
"grad_norm": 0.09397773898658479,
|
|
"learning_rate": 1.2121654544461958e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 100638467.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1207
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.4251693997307513e-08,
|
|
"advantages/std": 0.5227856040000916,
|
|
"advantages/var": 0.27330478774974054,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.4702508960573475,
|
|
"grad_norm": 0.1519680821743502,
|
|
"learning_rate": 1.2110657249517028e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 100732338.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1208
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.439337344384284e-09,
|
|
"advantages/std": 0.5726898908615112,
|
|
"advantages/var": 0.32797371109496964,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.4731182795698925,
|
|
"grad_norm": 0.10906049011686104,
|
|
"learning_rate": 1.209965728226365e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 100816224.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.16675156354904175,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1209
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.4759856630824375,
|
|
"grad_norm": 0.1486381160895392,
|
|
"learning_rate": 1.2088654656628898e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 100894492.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.478853046594982,
|
|
"grad_norm": 0.1259585465878865,
|
|
"learning_rate": 1.2077649386543236e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 100968092.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1211
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.481720430107527,
|
|
"grad_norm": 0.14289506144752373,
|
|
"learning_rate": 1.2066641485940456e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101048864.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1212
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899540529955257e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.4845878136200716,
|
|
"grad_norm": 0.08574260029696501,
|
|
"learning_rate": 1.2055630968757695e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 101115472.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1213
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966813525430481e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.4874551971326166,
|
|
"grad_norm": 0.1146376448735616,
|
|
"learning_rate": 1.2044617848935392e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101190115.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1214
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.449710856633628e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.490322580645161,
|
|
"grad_norm": 0.08397441447009728,
|
|
"learning_rate": 1.2033602140417287e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 101259449.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.493189964157706,
|
|
"grad_norm": 0.12488189256210573,
|
|
"learning_rate": 1.2022583857150396e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101338805.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1216
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962764040172268e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.496057347670251,
|
|
"grad_norm": 0.15252039222909725,
|
|
"learning_rate": 1.2011563013084996e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101418877.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1217
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 8.944094503700182e-09,
|
|
"advantages/std": 0.5726990103721619,
|
|
"advantages/var": 0.32798415648125356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.498924731182796,
|
|
"grad_norm": 0.14327275724747895,
|
|
"learning_rate": 1.2000539622174607e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 101493230.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.17806214094161987,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 1218
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.5017921146953404,
|
|
"grad_norm": 0.07537604610157368,
|
|
"learning_rate": 1.1989513698375965e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101576438.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1219
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.0655021560480435e-09,
|
|
"advantages/std": 0.5726983547210693,
|
|
"advantages/var": 0.32798340550021976,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.5046594982078854,
|
|
"grad_norm": 0.13579145137213305,
|
|
"learning_rate": 1.1978485255649032e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101659133.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.17700131237506866,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.5049341132024195e-09,
|
|
"advantages/std": 0.5726868510246277,
|
|
"advantages/var": 0.3279702293365041,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.5075268817204304,
|
|
"grad_norm": 0.12683903413547704,
|
|
"learning_rate": 1.1967454307956932e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101737395.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.16545340418815613,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1221
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975166886896984e-09,
|
|
"advantages/std": 0.4675965905189514,
|
|
"advantages/var": 0.21864657146494793,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.510394265232975,
|
|
"grad_norm": 0.09319276000313177,
|
|
"learning_rate": 1.195642086926599e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101813461.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1222
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96694656101877e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.5132616487455195,
|
|
"grad_norm": 0.10925697498476222,
|
|
"learning_rate": 1.1945384953545658e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 101882370.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1223
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958610843448447e-10,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.5161290322580645,
|
|
"grad_norm": 0.1334681175867728,
|
|
"learning_rate": 1.1934346574768547e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 101969857.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1224
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5628430692729714e-09,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.5189964157706095,
|
|
"grad_norm": 0.17314748462184348,
|
|
"learning_rate": 1.192330574691037e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 102052362.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125782003796406e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.521863799283154,
|
|
"grad_norm": 0.2015930023571842,
|
|
"learning_rate": 1.191226248394995e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 102135090.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1226
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.439373903985093e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.524731182795699,
|
|
"grad_norm": 0.1360875524771212,
|
|
"learning_rate": 1.1901216799869188e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 102221610.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1227
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875550720364307e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.5275985663082436,
|
|
"grad_norm": 0.13875885976895325,
|
|
"learning_rate": 1.1890168708653053e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 102298993.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1228
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.672251731040016e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.5304659498207887,
|
|
"grad_norm": 0.14595977161720528,
|
|
"learning_rate": 1.1879118224289561e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 102390020.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1229
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8994882297977766e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.533333333333333,
|
|
"grad_norm": 0.1415508927459141,
|
|
"learning_rate": 1.1868065360769758e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 102468791.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.5362007168458782,
|
|
"grad_norm": 0.09199765758500333,
|
|
"learning_rate": 1.1857010132087704e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 102545313.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08443662524223328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1231
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.749445740229558e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.539068100358423,
|
|
"grad_norm": 0.07560841262380356,
|
|
"learning_rate": 1.1845952552240448e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 102626121.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1232
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.541935483870968,
|
|
"grad_norm": 0.1342972638035452,
|
|
"learning_rate": 1.1834892635228022e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 102712264.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1233
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814455009491016e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.5448028673835124,
|
|
"grad_norm": 0.13968579486800936,
|
|
"learning_rate": 1.1823830395053416e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 102789811.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1234
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.5476702508960574,
|
|
"grad_norm": 0.13576783036106513,
|
|
"learning_rate": 1.1812765845722559e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 102868670.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.5505376344086024,
|
|
"grad_norm": 0.131430380104203,
|
|
"learning_rate": 1.1801699001244304e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 102951361.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1236
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.0349153895649778e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.553405017921147,
|
|
"grad_norm": 0.0740815983049289,
|
|
"learning_rate": 1.1790629875630412e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 103026292.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1237
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.5562724014336915,
|
|
"grad_norm": 0.15201122837467138,
|
|
"learning_rate": 1.1779558482895528e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 103109879.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1238
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814371735978847e-09,
|
|
"advantages/std": 0.5227928161621094,
|
|
"advantages/var": 0.2733123286307091,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.5591397849462365,
|
|
"grad_norm": 0.142543365316104,
|
|
"learning_rate": 1.1768484837057175e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 103195818.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1239
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444010916782605e-09,
|
|
"advantages/std": 0.5227840542793274,
|
|
"advantages/var": 0.27330316740873073,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.5620071684587815,
|
|
"grad_norm": 0.1421483267478702,
|
|
"learning_rate": 1.1757408952135722e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 103277159.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.13204573094844818,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"advantages/mean": -8.381903171539307e-09,
|
|
"advantages/snr": 1.463624342391056e-08,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.564874551971326,
|
|
"grad_norm": 0.11733338278671168,
|
|
"learning_rate": 1.174633084215437e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 103359879.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1241
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971078240891425e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.567741935483871,
|
|
"grad_norm": 0.0809665211104052,
|
|
"learning_rate": 1.1735250521139148e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 103437878.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1242
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814134106090637e-09,
|
|
"advantages/std": 0.5227997899055481,
|
|
"advantages/var": 0.27331962032528523,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.5706093189964156,
|
|
"grad_norm": 0.11570311984898715,
|
|
"learning_rate": 1.1724168003118874e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 103524530.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.14806944131851196,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1243
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.5734767025089607,
|
|
"grad_norm": 0.1183125986099129,
|
|
"learning_rate": 1.1713083302125158e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 103597931.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1244
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6185632348060608,
|
|
"advantages/var": 0.3826204754537379,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.576344086021505,
|
|
"grad_norm": 0.1298317431898703,
|
|
"learning_rate": 1.170199643219236e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 103688988.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.17859892547130585,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.5792114695340502,
|
|
"grad_norm": 0.08847302241965387,
|
|
"learning_rate": 1.16909074073576e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 103761600.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 1246
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199094228701277e-09,
|
|
"advantages/std": 0.4049627184867859,
|
|
"advantages/var": 0.1639948033642078,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.5820788530465952,
|
|
"grad_norm": 0.0733599792761376,
|
|
"learning_rate": 1.1679816241660717e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 103844973.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.5625,
|
|
"rewards/drgrpo_math_reward/std": 0.49802759289741516,
|
|
"step": 1247
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878747807970186e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.58494623655914,
|
|
"grad_norm": 0.16772127285058927,
|
|
"learning_rate": 1.1668722949144266e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 103930651.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1248
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.449710856633628e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.5878136200716844,
|
|
"grad_norm": 0.12085872474781346,
|
|
"learning_rate": 1.165762754385349e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104005167.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1249
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599795920412534e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.5906810035842294,
|
|
"grad_norm": 0.08844179403853,
|
|
"learning_rate": 1.1646530039836311e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104083834.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.252531375408196e-09,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.5935483870967744,
|
|
"grad_norm": 0.12119886337231127,
|
|
"learning_rate": 1.1635430451143307e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104162273.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.15072786808013916,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1251
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131418995429572e-09,
|
|
"advantages/std": 0.5726691484451294,
|
|
"advantages/var": 0.32794995358086965,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.596415770609319,
|
|
"grad_norm": 0.1272812772182248,
|
|
"learning_rate": 1.1624328791827696e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104235263.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.14389309287071228,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1252
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.599283154121864,
|
|
"grad_norm": 0.03398207896287452,
|
|
"learning_rate": 1.1613225075945314e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104321147.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 1253
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.6021505376344085,
|
|
"grad_norm": 0.08432849289207688,
|
|
"learning_rate": 1.1602119317554603e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104396372.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1254
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131246346616979e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.6050179211469535,
|
|
"grad_norm": 0.12084258854824852,
|
|
"learning_rate": 1.15910115307166e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104476369.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.607885304659498,
|
|
"grad_norm": 0.08778679284878087,
|
|
"learning_rate": 1.1579901729494889e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104545444.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1256
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.610752688172043,
|
|
"grad_norm": 0.03071083150850874,
|
|
"learning_rate": 1.156878992795563e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 104627475.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1257
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 1.4082993428404723e-08,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.6136200716845877,
|
|
"grad_norm": 0.08635758229878303,
|
|
"learning_rate": 1.155767614016749e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 104711766.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1258
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.516765728025316e-09,
|
|
"advantages/std": 0.6185770630836487,
|
|
"advantages/var": 0.3826375829731923,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.6164874551971327,
|
|
"grad_norm": 0.12554710188999404,
|
|
"learning_rate": 1.1546560380201678e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104805976.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.19568344950675964,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1259
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599467569791755e-09,
|
|
"advantages/std": 0.404969722032547,
|
|
"advantages/var": 0.16400047576311838,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.6193548387096772,
|
|
"grad_norm": 0.08417377447220432,
|
|
"learning_rate": 1.1535442662131873e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 104883446.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.09916213154792786,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899540529955257e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.6222222222222222,
|
|
"grad_norm": 0.1296610194185722,
|
|
"learning_rate": 1.1524323000034254e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 104972560.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1261
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.3942077395823529e-08,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.6250896057347672,
|
|
"grad_norm": 0.0770382463476364,
|
|
"learning_rate": 1.151320140798745e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 105053986.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1262
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.627956989247312,
|
|
"grad_norm": 0.058208183836016815,
|
|
"learning_rate": 1.1502077900072533e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 105124167.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1263
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.966813525430481e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.6308243727598564,
|
|
"grad_norm": 0.11667705362885708,
|
|
"learning_rate": 1.1490952490373012e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 105209877.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1264
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.5167470134271535e-09,
|
|
"advantages/std": 0.618579626083374,
|
|
"advantages/var": 0.3826407538054468,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 3.6336917562724014,
|
|
"grad_norm": 0.1565468928481836,
|
|
"learning_rate": 1.147982519297479e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 105297314.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.2001592218875885,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.816724861393605e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.6365591397849464,
|
|
"grad_norm": 0.12059672816300006,
|
|
"learning_rate": 1.1468696021966171e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 105365104.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1266
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.639426523297491,
|
|
"grad_norm": 0.11550535544705419,
|
|
"learning_rate": 1.1457564991437823e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 105442599.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1267
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.642293906810036,
|
|
"grad_norm": 0.08574098513112333,
|
|
"learning_rate": 1.1446432115482772e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 105514011.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1268
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.6451612903225805,
|
|
"grad_norm": 0.05984439476964636,
|
|
"learning_rate": 1.143529740819638e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 105593332.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1269
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.6480286738351255,
|
|
"grad_norm": 0.11318223010227013,
|
|
"learning_rate": 1.1424160883676332e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 105674573.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.505601632872161e-09,
|
|
"advantages/std": 0.6185716986656189,
|
|
"advantages/var": 0.3826309463900692,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.65089605734767,
|
|
"grad_norm": 0.18279507068056253,
|
|
"learning_rate": 1.1413022556022606e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 105756016.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.18990948796272278,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1271
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299875955969384e-09,
|
|
"advantages/std": 0.40494468808174133,
|
|
"advantages/var": 0.16398020040561878,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.653763440860215,
|
|
"grad_norm": 0.07169495289737582,
|
|
"learning_rate": 1.1401882439337464e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 105835971.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1272
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.3009992800507722e-08,
|
|
"advantages/std": 0.5726813673973083,
|
|
"advantages/var": 0.32796394856405087,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.65663082437276,
|
|
"grad_norm": 0.13847441778441305,
|
|
"learning_rate": 1.1390740547725442e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 105919242.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.15650182962417603,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1273
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599512249801046e-09,
|
|
"advantages/std": 0.4049657881259918,
|
|
"advantages/var": 0.1639972895525057,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.6594982078853047,
|
|
"grad_norm": 0.12531312744111278,
|
|
"learning_rate": 1.1379596895293314e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 105994890.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.0946863517165184,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1274
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.6623655913978492,
|
|
"grad_norm": 0.15310330840994021,
|
|
"learning_rate": 1.1368451496150087e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 106074174.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.6652329749103942,
|
|
"grad_norm": 0.1268705759404372,
|
|
"learning_rate": 1.1357304364406978e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 106168247.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.13098980486392975,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1276
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.6681003584229392,
|
|
"grad_norm": 0.08277826866888821,
|
|
"learning_rate": 1.1346155514177398e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 106237965.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1277
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.670967741935484,
|
|
"grad_norm": 0.1021870065331042,
|
|
"learning_rate": 1.1335004959576932e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 106318420.0,
|
|
"reward": 0.5078125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.5078125,
|
|
"rewards/drgrpo_math_reward/std": 0.5019033551216125,
|
|
"step": 1278
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.673835125448029,
|
|
"grad_norm": 0.018836866369400712,
|
|
"learning_rate": 1.1323852714723335e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 106397754.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1279
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.6767025089605734,
|
|
"grad_norm": 0.14637770234607897,
|
|
"learning_rate": 1.131269879373648e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 106479388.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.4083760350291931e-09,
|
|
"advantages/std": 0.6612740755081177,
|
|
"advantages/var": 0.4372834029391157,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.6795698924731184,
|
|
"grad_norm": 0.16404051028339522,
|
|
"learning_rate": 1.1301543210738382e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 106552798.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.20964756608009338,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1281
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814773893309386e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.682437275985663,
|
|
"grad_norm": 0.17002361057183865,
|
|
"learning_rate": 1.1290385979853151e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 106627951.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1282
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.685304659498208,
|
|
"grad_norm": 0.1620477574415513,
|
|
"learning_rate": 1.1279227115206986e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 106704949.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1283
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.033618501892656e-09,
|
|
"advantages/std": 0.6185711026191711,
|
|
"advantages/var": 0.38263020899549716,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.688172043010753,
|
|
"grad_norm": 0.15870349587849134,
|
|
"learning_rate": 1.126806663092815e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 106798802.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.18884867429733276,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1284
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344407185044788e-09,
|
|
"advantages/std": 0.5227834582328796,
|
|
"advantages/var": 0.273302544201929,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.6910394265232975,
|
|
"grad_norm": 0.1153759937149323,
|
|
"learning_rate": 1.1256904541146965e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 106888194.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.970882307601416e-09,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.693906810035842,
|
|
"grad_norm": 0.09479971517786234,
|
|
"learning_rate": 1.124574085999578e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 106966067.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1286
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.696774193548387,
|
|
"grad_norm": 0.0636527001797721,
|
|
"learning_rate": 1.1234575601608955e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 107047911.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1287
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9834382441424445e-09,
|
|
"advantages/std": 0.4675973355770111,
|
|
"advantages/var": 0.21864726823871994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.699641577060932,
|
|
"grad_norm": 0.15409206568396186,
|
|
"learning_rate": 1.1223408780122859e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107127238.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.11100947856903076,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1288
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.7025089605734767,
|
|
"grad_norm": 0.081940982852503,
|
|
"learning_rate": 1.1212240409675824e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107211703.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1289
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.7053763440860212,
|
|
"grad_norm": 0.07718402860658599,
|
|
"learning_rate": 1.120107050440816e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 107297379.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299829409932592e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.7082437275985662,
|
|
"grad_norm": 0.16432220896658642,
|
|
"learning_rate": 1.1189899078462106e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107369068.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1291
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016360347740786e-09,
|
|
"advantages/std": 0.5227997899055481,
|
|
"advantages/var": 0.27331962032528523,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.7111111111111112,
|
|
"grad_norm": 0.1053941974370414,
|
|
"learning_rate": 1.117872614598184e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107457753.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.14806942641735077,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1292
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962665216109293e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.713978494623656,
|
|
"grad_norm": 0.11531907034017322,
|
|
"learning_rate": 1.1167551721113434e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 107536743.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1293
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.065623173308489e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.716845878136201,
|
|
"grad_norm": 0.09979824294231059,
|
|
"learning_rate": 1.1156375818004855e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107627363.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1294
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.528079978419126e-09,
|
|
"advantages/std": 0.6185657978057861,
|
|
"advantages/var": 0.3826236462151087,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.7197132616487454,
|
|
"grad_norm": 0.10167525036542162,
|
|
"learning_rate": 1.1145198450805945e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107715699.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1830747127532959,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967079601050182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.7225806451612904,
|
|
"grad_norm": 0.17046357996494574,
|
|
"learning_rate": 1.1134019633668396e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107788985.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1296
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.725448028673835,
|
|
"grad_norm": 0.08946783404091041,
|
|
"learning_rate": 1.1122839380745737e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107871812.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1297
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.3361080419982039e-08,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.72831541218638,
|
|
"grad_norm": 0.09614758200353986,
|
|
"learning_rate": 1.1111657706193312e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 107947065.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1298
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.672108843924502e-09,
|
|
"advantages/std": 0.5228019952774048,
|
|
"advantages/var": 0.2733219262660356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.731182795698925,
|
|
"grad_norm": 0.1333434622161509,
|
|
"learning_rate": 1.1100474624168268e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 108035921.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.14807432889938354,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1299
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.7340501792114695,
|
|
"grad_norm": 0.03215331874989343,
|
|
"learning_rate": 1.1089290148829536e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 108113233.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967257841033182e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.736917562724014,
|
|
"grad_norm": 0.09556214212903666,
|
|
"learning_rate": 1.1078104294337804e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 108201219.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1301
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983400669593257e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.739784946236559,
|
|
"grad_norm": 0.09208740991565705,
|
|
"learning_rate": 1.1066917074855515e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 108271266.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.11336850374937057,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1302
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344329800322181e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.742652329749104,
|
|
"grad_norm": 0.13102744513708434,
|
|
"learning_rate": 1.1055728504546833e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 108349729.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1303
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983450684521008e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 3.7455197132616487,
|
|
"grad_norm": 0.10207573847644788,
|
|
"learning_rate": 1.1044538597577637e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 108444578.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1304
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.691872442631884e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.7483870967741937,
|
|
"grad_norm": 0.10111644770835693,
|
|
"learning_rate": 1.1033347368115494e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 108524385.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.15650182962417603,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1498692618242365e-08,
|
|
"advantages/std": 0.40496888756752014,
|
|
"advantages/var": 0.16399979989767477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.7512544802867382,
|
|
"grad_norm": 0.1064704614620527,
|
|
"learning_rate": 1.1022154830329648e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 108616264.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.09810129553079605,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1306
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.7541218637992833,
|
|
"grad_norm": 0.09424157843515399,
|
|
"learning_rate": 1.1010960998391001e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 108703232.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 1307
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.756989247311828,
|
|
"grad_norm": 0.06870232361927221,
|
|
"learning_rate": 1.099976588647209e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 108768091.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1308
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.759856630824373,
|
|
"grad_norm": 0.13151750578685634,
|
|
"learning_rate": 1.0988569508747075e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 108847049.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1309
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.762724014336918,
|
|
"grad_norm": 0.05392749330554253,
|
|
"learning_rate": 1.0977371879391721e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 108917659.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.618579626083374,
|
|
"advantages/var": 0.3826407538054468,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.7655913978494624,
|
|
"grad_norm": 0.12948372587088572,
|
|
"learning_rate": 1.0966173012583366e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109013921.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.2001592218875885,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1311
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.768458781362007,
|
|
"grad_norm": 0.06999483842741774,
|
|
"learning_rate": 1.0954972922500935e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 109095568.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1312
|
|
},
|
|
{
|
|
"advantages/mean": 6.984919309616089e-09,
|
|
"advantages/snr": 1.336040712984824e-08,
|
|
"advantages/std": 0.5228073596954346,
|
|
"advantages/var": 0.2733275353517115,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.771326164874552,
|
|
"grad_norm": 0.11859180878926806,
|
|
"learning_rate": 1.0943771623324882e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 109174541.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.15596505999565125,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1313
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.774193548387097,
|
|
"grad_norm": 0.03892614703730164,
|
|
"learning_rate": 1.0932569129237205e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109256372.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1314
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.7770609318996415,
|
|
"grad_norm": 0.11123047373003779,
|
|
"learning_rate": 1.0921365454421402e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109339020.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.504997077293582e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 3.7799283154121865,
|
|
"grad_norm": 0.12202431275931674,
|
|
"learning_rate": 1.0910160613062487e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 109426843.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1316
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.782795698924731,
|
|
"grad_norm": 0.10100155038741537,
|
|
"learning_rate": 1.0898954619346923e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109509983.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 1317
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907665222004876e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.785663082437276,
|
|
"grad_norm": 0.10900857756454012,
|
|
"learning_rate": 1.088774748746266e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109585662.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1318
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.7885304659498207,
|
|
"grad_norm": 0.05121812077394973,
|
|
"learning_rate": 1.0876539231599067e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109663710.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1319
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.011193400505165e-09,
|
|
"advantages/std": 0.6185737252235413,
|
|
"advantages/var": 0.3826334535369291,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.7913978494623657,
|
|
"grad_norm": 0.13595044286737032,
|
|
"learning_rate": 1.0865329865946945e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109744234.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.19332444667816162,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.4083577551179745e-09,
|
|
"advantages/std": 0.6612826585769653,
|
|
"advantages/var": 0.4372947545346193,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.7942652329749103,
|
|
"grad_norm": 0.15701328467704398,
|
|
"learning_rate": 1.085411940469851e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109825414.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.21884137392044067,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1321
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516833624148468e-09,
|
|
"advantages/std": 0.6185677647590637,
|
|
"advantages/var": 0.3826260795990244,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.7971326164874553,
|
|
"grad_norm": 0.11017406656209758,
|
|
"learning_rate": 1.0842907862047342e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 109916589.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.18648964166641235,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1322
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016494700483271e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.8,
|
|
"grad_norm": 0.14760198313667855,
|
|
"learning_rate": 1.0831695252188413e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 109994855.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1323
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.802867383512545,
|
|
"grad_norm": 0.1477807109136703,
|
|
"learning_rate": 1.0820481589318031e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110077011.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1324
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.80573476702509,
|
|
"grad_norm": 0.10275246143222694,
|
|
"learning_rate": 1.0809266887633848e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110161437.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899540529955257e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.8086021505376344,
|
|
"grad_norm": 0.09403150692646986,
|
|
"learning_rate": 1.0798051161334817e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 110243698.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1326
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.811469534050179,
|
|
"grad_norm": 0.1064862118449541,
|
|
"learning_rate": 1.0786834424621209e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110311106.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1327
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536081669351505e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 3.814336917562724,
|
|
"grad_norm": 0.10207951572882269,
|
|
"learning_rate": 1.0775616691694553e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110393058.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1328
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749445740229558e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.817204301075269,
|
|
"grad_norm": 0.13002847818344113,
|
|
"learning_rate": 1.0764397976757655e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110474665.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1329
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.8200716845878135,
|
|
"grad_norm": 0.09621453531039598,
|
|
"learning_rate": 1.0753178294014556e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110555789.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.8229390681003586,
|
|
"grad_norm": 0.08820674061444314,
|
|
"learning_rate": 1.074195765767052e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110641561.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1331
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536934733273465e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.825806451612903,
|
|
"grad_norm": 0.10667133981075559,
|
|
"learning_rate": 1.073073608193203e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110729635.0,
|
|
"reward": 0.640625,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.640625,
|
|
"rewards/drgrpo_math_reward/std": 0.481702595949173,
|
|
"step": 1332
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.828673835125448,
|
|
"grad_norm": 0.12307235433211015,
|
|
"learning_rate": 1.071951358100675e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110806669.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1333
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.8315412186379927,
|
|
"grad_norm": 0.06152352254396921,
|
|
"learning_rate": 1.0708290169103514e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 110883099.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1334
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349232344696665e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.8344086021505377,
|
|
"grad_norm": 0.07339730279981847,
|
|
"learning_rate": 1.0697065860432314e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 110964489.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983562397524497e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.8372759856630827,
|
|
"grad_norm": 0.15370363082688193,
|
|
"learning_rate": 1.0685840669204271e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 111043042.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1336
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814513910737996e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.8401433691756273,
|
|
"grad_norm": 0.09558394208295336,
|
|
"learning_rate": 1.0674614609631634e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 111123683.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1337
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.843010752688172,
|
|
"grad_norm": 0.12520949146235022,
|
|
"learning_rate": 1.0663387695927742e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 111201382.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1338
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.1579757501173404e-08,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.845878136200717,
|
|
"grad_norm": 0.17554273424862962,
|
|
"learning_rate": 1.065215994230702e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 111273465.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1339
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.848745519713262,
|
|
"grad_norm": 0.06860011960572143,
|
|
"learning_rate": 1.0640931362984955e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 111353193.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131077090192316e-09,
|
|
"advantages/std": 0.5726932287216187,
|
|
"advantages/var": 0.3279775342235922,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.8516129032258064,
|
|
"grad_norm": 0.11329417129773361,
|
|
"learning_rate": 1.0629701972178078e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 111436228.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.172288179397583,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1341
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.98750518535691e-09,
|
|
"advantages/std": 0.46760883927345276,
|
|
"advantages/var": 0.21865802656666578,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.8544802867383514,
|
|
"grad_norm": 0.09480214205734269,
|
|
"learning_rate": 1.061847178410395e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 111524936.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.11914245784282684,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1342
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.29975138607539e-09,
|
|
"advantages/std": 0.4049666225910187,
|
|
"advantages/var": 0.16399796541277656,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.857347670250896,
|
|
"grad_norm": 0.13440261245254936,
|
|
"learning_rate": 1.0607240812981144e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 111600908.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.09574718773365021,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1343
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.4948445614343708e-08,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.860215053763441,
|
|
"grad_norm": 0.08384584122117245,
|
|
"learning_rate": 1.059600907302921e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 111687198.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 1344
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.966859224177393e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.8630824372759855,
|
|
"grad_norm": 0.14578138902811386,
|
|
"learning_rate": 1.0584776578468697e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 111761184.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899421713267256e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.8659498207885306,
|
|
"grad_norm": 0.09092611235150355,
|
|
"learning_rate": 1.0573543343521082e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 111839818.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1346
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9833176526633854e-09,
|
|
"advantages/std": 0.46761149168014526,
|
|
"advantages/var": 0.21866050715133056,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 3.868817204301075,
|
|
"grad_norm": 0.08858353776566967,
|
|
"learning_rate": 1.0562309382408798e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 111924067.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12255740165710449,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1347
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.041931588438948e-09,
|
|
"advantages/std": 0.6612692475318909,
|
|
"advantages/var": 0.43727701773139316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.87168458781362,
|
|
"grad_norm": 0.1840637119804573,
|
|
"learning_rate": 1.055107470935519e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 112010302.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.20069600641727448,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1348
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6612985134124756,
|
|
"advantages/var": 0.43731572384155015,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.8745519713261647,
|
|
"grad_norm": 0.1370269768902454,
|
|
"learning_rate": 1.0539839338584508e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 112108864.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.24146251380443573,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 1349
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.8774193548387097,
|
|
"grad_norm": 0.04320776006081769,
|
|
"learning_rate": 1.0528603284321878e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 112185742.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.3009750424828314e-08,
|
|
"advantages/std": 0.5726920366287231,
|
|
"advantages/var": 0.32797616881795477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.8802867383512547,
|
|
"grad_norm": 0.1255881114221988,
|
|
"learning_rate": 1.0517366560793304e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 112270299.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.1701665222644806,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1351
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983384167481491e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.8831541218637993,
|
|
"grad_norm": 0.16555960495596858,
|
|
"learning_rate": 1.0506129182225626e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 112351127.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1352
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599694025151775e-09,
|
|
"advantages/std": 0.40494978427886963,
|
|
"advantages/var": 0.16398432778750305,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.886021505376344,
|
|
"grad_norm": 0.08557565482351023,
|
|
"learning_rate": 1.0494891162846513e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 112424515.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1353
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.888888888888889,
|
|
"grad_norm": 0.09457011403274432,
|
|
"learning_rate": 1.0483652516884458e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 112498953.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1354
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962814881146116e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.891756272401434,
|
|
"grad_norm": 0.1745861528850439,
|
|
"learning_rate": 1.0472413258568733e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 112571392.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.8946236559139784,
|
|
"grad_norm": 0.07075151530037413,
|
|
"learning_rate": 1.0461173402129393e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 112650111.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1356
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.8974910394265234,
|
|
"grad_norm": 0.1079626188074426,
|
|
"learning_rate": 1.0449932961797247e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 112715656.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1357
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.703125,
|
|
"epoch": 3.900358422939068,
|
|
"grad_norm": 0.07701497381093976,
|
|
"learning_rate": 1.0438691951803848e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 112802452.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1358
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 3.903225806451613,
|
|
"grad_norm": 0.06111207006774216,
|
|
"learning_rate": 1.0427450386381462e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 112885542.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1359
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.9060931899641576,
|
|
"grad_norm": 0.07875273671794938,
|
|
"learning_rate": 1.0416208279763073e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 112961615.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.9089605734767026,
|
|
"grad_norm": 0.08032362493131201,
|
|
"learning_rate": 1.0404965646182329e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 113046939.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1361
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962814881146116e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.9118279569892476,
|
|
"grad_norm": 0.1297952360056973,
|
|
"learning_rate": 1.0393722499873562e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113134836.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1362
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.235142426779239e-09,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.914695340501792,
|
|
"grad_norm": 0.15569561366482432,
|
|
"learning_rate": 1.038247885507175e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113226761.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1363
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379887186086637e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.9175627240143367,
|
|
"grad_norm": 0.11041596653421341,
|
|
"learning_rate": 1.0371234726012496e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113294239.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1364
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.9204301075268817,
|
|
"grad_norm": 0.05127564834307841,
|
|
"learning_rate": 1.0359990126932022e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113376874.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.9232974910394267,
|
|
"grad_norm": 0.09498261488724016,
|
|
"learning_rate": 1.0348745072067141e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113455819.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1366
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599795920412534e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 3.9261648745519713,
|
|
"grad_norm": 0.0646405995256052,
|
|
"learning_rate": 1.033749957565525e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113543974.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1367
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.9290322580645163,
|
|
"grad_norm": 0.1435436920010314,
|
|
"learning_rate": 1.0326253651934294e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113609869.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1368
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016351208262037e-09,
|
|
"advantages/std": 0.5228003859519958,
|
|
"advantages/var": 0.2733202435515558,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.931899641577061,
|
|
"grad_norm": 0.1259963606265134,
|
|
"learning_rate": 1.031500731514277e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113704248.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.14913026988506317,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1369
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450174584180815e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.934767025089606,
|
|
"grad_norm": 0.05836523933563015,
|
|
"learning_rate": 1.030376057951969e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 113774762.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 3.9376344086021504,
|
|
"grad_norm": 0.04136085261347344,
|
|
"learning_rate": 1.029251345930458e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 113853475.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1371
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 7.041743076701556e-10,
|
|
"advantages/std": 0.6612869501113892,
|
|
"advantages/var": 0.4373004303876229,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.9405017921146954,
|
|
"grad_norm": 0.2235437844824043,
|
|
"learning_rate": 1.028126596873744e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 113945853.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.22673210501670837,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1372
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.94336917562724,
|
|
"grad_norm": 0.06286535968371615,
|
|
"learning_rate": 1.0270018122058753e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 114013504.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1373
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262188031035393e-09,
|
|
"advantages/std": 0.5726920366287231,
|
|
"advantages/var": 0.32797616881795477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.765625,
|
|
"epoch": 3.946236559139785,
|
|
"grad_norm": 0.11232914675015847,
|
|
"learning_rate": 1.0258769933509438e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 114101359.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1701665222644806,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1374
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.9491039426523296,
|
|
"grad_norm": 0.11737911669960818,
|
|
"learning_rate": 1.0247521417330863e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 114176210.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975101004389886e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.9519713261648746,
|
|
"grad_norm": 0.0874895004538831,
|
|
"learning_rate": 1.0236272587764798e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 114244963.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1376
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983629174425397e-09,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.9548387096774196,
|
|
"grad_norm": 0.11009512079189243,
|
|
"learning_rate": 1.0225023459053415e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 114330630.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1377
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262492693233955e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.957706093189964,
|
|
"grad_norm": 0.1347477292006464,
|
|
"learning_rate": 1.0213774045439265e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 114409319.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1378
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.9605734767025087,
|
|
"grad_norm": 0.10334727924039125,
|
|
"learning_rate": 1.0202524361165255e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 114481299.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1379
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.9634408602150537,
|
|
"grad_norm": 0.09093757904017581,
|
|
"learning_rate": 1.0191274420474647e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 114560714.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.9663082437275987,
|
|
"grad_norm": 0.08678117670580014,
|
|
"learning_rate": 1.0180024237611009e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 114632344.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1381
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 3.9691756272401433,
|
|
"grad_norm": 0.07109617726796713,
|
|
"learning_rate": 1.0168773826818235e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 114704697.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1382
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.9720430107526883,
|
|
"grad_norm": 0.07477735095983636,
|
|
"learning_rate": 1.015752320234049e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 114780505.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1383
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.131056779894869e-09,
|
|
"advantages/std": 0.5726946592330933,
|
|
"advantages/var": 0.3279791727141088,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 3.974910394265233,
|
|
"grad_norm": 0.10464202287838072,
|
|
"learning_rate": 1.0146272378422227e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 114878778.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.17123225331306458,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1384
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.977777777777778,
|
|
"grad_norm": 0.08964071801741949,
|
|
"learning_rate": 1.0135021369308136e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 114944000.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.9806451612903224,
|
|
"grad_norm": 0.029530648555468466,
|
|
"learning_rate": 1.0123770189243149e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 115012550.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1386
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.672120115913595e-09,
|
|
"advantages/std": 0.5227997899055481,
|
|
"advantages/var": 0.27331962032528523,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 3.9835125448028674,
|
|
"grad_norm": 0.13306170651545898,
|
|
"learning_rate": 1.0112518852472413e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 115102429.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.14806944131851196,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1387
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.4082993428404723e-08,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 3.9863799283154124,
|
|
"grad_norm": 0.050078905854352956,
|
|
"learning_rate": 1.0101267373241277e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 115166656.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1388
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.989247311827957,
|
|
"grad_norm": 0.05174973349172179,
|
|
"learning_rate": 1.0090015765795264e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 115240570.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1389
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 3.9921146953405016,
|
|
"grad_norm": 0.09582298486917787,
|
|
"learning_rate": 1.0078764044380063e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 115319665.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 3.9949820788530466,
|
|
"grad_norm": 0.08362897906725004,
|
|
"learning_rate": 1.0067512223241507e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 115389212.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1391
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 3.9978494623655916,
|
|
"grad_norm": 0.08647927186288491,
|
|
"learning_rate": 1.0056260316625558e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 115469012.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1392
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.002867383512545,
|
|
"grad_norm": 0.05710977983237636,
|
|
"learning_rate": 1.0045008338778277e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 115555267.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1393
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917003347966285e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.00573476702509,
|
|
"grad_norm": 0.1891657059044791,
|
|
"learning_rate": 1.0033756303945828e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 115633653.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1394
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907227504745508e-10,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.008602150537635,
|
|
"grad_norm": 0.14291200048617728,
|
|
"learning_rate": 1.0022504226374438e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 115716623.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 8.450174584180815e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.011469534050179,
|
|
"grad_norm": 0.04918178407942894,
|
|
"learning_rate": 1.0011252120310387e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 115801496.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1396
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.691702426092771e-09,
|
|
"advantages/std": 0.5726984143257141,
|
|
"advantages/var": 0.3279834737711873,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.014336917562724,
|
|
"grad_norm": 0.1371515913326463,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 115884411.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.17700131237506866,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1397
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.970961834751672e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.017204301075269,
|
|
"grad_norm": 0.09653238089827981,
|
|
"learning_rate": 9.988747879689612e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 115964895.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1398
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.2470398597593402e-08,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.020071684587814,
|
|
"grad_norm": 0.11479796353253469,
|
|
"learning_rate": 9.97749577362556e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116046692.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1399
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.022939068100358,
|
|
"grad_norm": 0.065055089170235,
|
|
"learning_rate": 9.966243696054175e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116123982.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.453528449034464e-09,
|
|
"advantages/std": 0.5228003859519958,
|
|
"advantages/var": 0.2733202435515558,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.025806451612903,
|
|
"grad_norm": 0.132510357212156,
|
|
"learning_rate": 9.954991661221722e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116212703.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.14913025498390198,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1401
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041577316723057e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.028673835125448,
|
|
"grad_norm": 4.972098368851917,
|
|
"learning_rate": 9.943739683374443e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 116286447.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1402
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.031541218637993,
|
|
"grad_norm": 0.07543600126473533,
|
|
"learning_rate": 9.93248777675849e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116359830.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 1403
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.2470118506643711e-08,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 4.034408602150537,
|
|
"grad_norm": 0.09332703119373481,
|
|
"learning_rate": 9.921235955619938e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116446264.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1404
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.037275985663083,
|
|
"grad_norm": 0.08579305490742774,
|
|
"learning_rate": 9.909984234204737e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 116526150.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.040143369175627,
|
|
"grad_norm": 0.08498597057038827,
|
|
"learning_rate": 9.898732626758724e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116594493.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 1406
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536335554007825e-09,
|
|
"advantages/std": 0.5227880477905273,
|
|
"advantages/var": 0.2733073429126307,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.043010752688172,
|
|
"grad_norm": 0.09641363420996596,
|
|
"learning_rate": 9.887481147527586e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 116675425.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1407
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6612716317176819,
|
|
"advantages/var": 0.4372801709145655,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.045878136200717,
|
|
"grad_norm": 0.17621604071730299,
|
|
"learning_rate": 9.876229810756855e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 116764230.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.20517177879810333,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 1408
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.048745519713262,
|
|
"grad_norm": 0.06353852467813742,
|
|
"learning_rate": 9.864978630691865e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116827715.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1409
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.051612903225807,
|
|
"grad_norm": 0.11046684032249461,
|
|
"learning_rate": 9.853727621577773e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116907743.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876548503938182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.054480286738351,
|
|
"grad_norm": 0.11414665198528622,
|
|
"learning_rate": 9.842476797659508e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 116972996.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1411
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.23513460695797e-09,
|
|
"advantages/std": 0.5227840542793274,
|
|
"advantages/var": 0.27330316740873073,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.057347670250896,
|
|
"grad_norm": 0.13426543371669594,
|
|
"learning_rate": 9.831226173181769e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117056953.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.13204574584960938,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1412
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.060215053763441,
|
|
"grad_norm": 0.07070571532585718,
|
|
"learning_rate": 9.819975762388993e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117123155.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1413
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299811976811062e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.063082437275986,
|
|
"grad_norm": 0.08264438903728014,
|
|
"learning_rate": 9.808725579525354e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 117210931.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1414
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.06594982078853,
|
|
"grad_norm": 0.040465076588479504,
|
|
"learning_rate": 9.797475638834744e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117286212.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907307732763402e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.068817204301075,
|
|
"grad_norm": 0.14344840187422328,
|
|
"learning_rate": 9.786225954560738e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117368830.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1416
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.626265687704098e-09,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.07168458781362,
|
|
"grad_norm": 0.18072254333567256,
|
|
"learning_rate": 9.774976540946586e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117453632.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.15072788298130035,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1417
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.074551971326165,
|
|
"grad_norm": 0.08223549807688292,
|
|
"learning_rate": 9.763727412235201e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117529795.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1418
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.077419354838709,
|
|
"grad_norm": 0.027505824653321093,
|
|
"learning_rate": 9.752478582669136e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117595078.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1419
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.080286738351255,
|
|
"grad_norm": 0.09363819824185526,
|
|
"learning_rate": 9.74123006649056e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 117682052.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.0831541218637994,
|
|
"grad_norm": 0.07481291580787158,
|
|
"learning_rate": 9.729981877941249e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 117759506.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 1421
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.086021505376344,
|
|
"grad_norm": 0.06013183556706222,
|
|
"learning_rate": 9.71873403126256e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 117828711.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1422
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814751550759118e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.088888888888889,
|
|
"grad_norm": 0.09458985029231685,
|
|
"learning_rate": 9.707486540695418e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117908229.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1423
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2524951534513563e-09,
|
|
"advantages/std": 0.5726819038391113,
|
|
"advantages/var": 0.32796456298478915,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.091756272401434,
|
|
"grad_norm": 0.17279198048030153,
|
|
"learning_rate": 9.69623942048031e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 117996385.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.15756267309188843,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1424
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.094623655913979,
|
|
"grad_norm": 0.06020982212956875,
|
|
"learning_rate": 9.68499268485723e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 118081885.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5628597236829876e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.097491039426523,
|
|
"grad_norm": 0.12405911654579116,
|
|
"learning_rate": 9.673746348065708e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 118157017.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1426
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 7.3180584939108565e-09,
|
|
"advantages/std": 0.5726862549781799,
|
|
"advantages/var": 0.3279695466409329,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.100358422939068,
|
|
"grad_norm": 0.09539003259588408,
|
|
"learning_rate": 9.66250042434475e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 118241083.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.16439256072044373,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1427
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299811976811062e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.103225806451613,
|
|
"grad_norm": 0.14925567107369417,
|
|
"learning_rate": 9.651254927932862e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 118320584.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1428
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.562997839424082e-09,
|
|
"advantages/std": 0.5227746963500977,
|
|
"advantages/var": 0.2732933831439368,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.106093189964158,
|
|
"grad_norm": 0.08122134763980224,
|
|
"learning_rate": 9.64000987306798e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 118415711.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.12179600447416306,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1429
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.108960573476702,
|
|
"grad_norm": 0.058516452906352656,
|
|
"learning_rate": 9.628765273987505e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 118501690.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814432667740602e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.111827956989248,
|
|
"grad_norm": 0.15317697617053813,
|
|
"learning_rate": 9.61752114492825e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 118574370.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1431
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.114695340501792,
|
|
"grad_norm": 0.13592734930576464,
|
|
"learning_rate": 9.60627750012644e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 118656650.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1432
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.117562724014337,
|
|
"grad_norm": 0.10393644269771243,
|
|
"learning_rate": 9.595034353817672e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 118739205.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1433
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.130986540898401e-09,
|
|
"advantages/std": 0.5726996064186096,
|
|
"advantages/var": 0.32798483919203036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.120430107526881,
|
|
"grad_norm": 0.18049214695560123,
|
|
"learning_rate": 9.583791720236928e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 118814930.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.17912296950817108,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1434
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.123297491039427,
|
|
"grad_norm": 0.058660517467965995,
|
|
"learning_rate": 9.572549613618537e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 118888437.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.1261648745519715,
|
|
"grad_norm": 0.08115979157730528,
|
|
"learning_rate": 9.56130804819615e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 118972605.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1436
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.129032258064516,
|
|
"grad_norm": 0.09111951306603897,
|
|
"learning_rate": 9.550067038202754e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 119066369.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1437
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.131899641577061,
|
|
"grad_norm": 0.07689670459835662,
|
|
"learning_rate": 9.538826597870609e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119143072.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1438
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.134767025089606,
|
|
"grad_norm": 0.11387495882854284,
|
|
"learning_rate": 9.527586741431268e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119217315.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1439
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.137634408602151,
|
|
"grad_norm": 0.13494130220291506,
|
|
"learning_rate": 9.516347483115544e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 119300412.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954734451444e-08,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 4.140501792114695,
|
|
"grad_norm": 0.11342136827484776,
|
|
"learning_rate": 9.505108837153488e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119385911.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1441
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125995678848164e-09,
|
|
"advantages/std": 0.5227746963500977,
|
|
"advantages/var": 0.2732933831439368,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.14336917562724,
|
|
"grad_norm": 0.1520353991334203,
|
|
"learning_rate": 9.493870817774375e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119461233.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12179599702358246,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1442
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.146236559139785,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.482633439206695e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119537972.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1443
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.14910394265233,
|
|
"grad_norm": 0.08421970535405401,
|
|
"learning_rate": 9.47139671567812e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119614186.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1444
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.151971326164874,
|
|
"grad_norm": 0.11243615414331748,
|
|
"learning_rate": 9.460160661415495e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119683159.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.235170151758147e-09,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 4.15483870967742,
|
|
"grad_norm": 0.10245184557169275,
|
|
"learning_rate": 9.448925290644812e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119768738.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1446
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.563018557708836e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.157706093189964,
|
|
"grad_norm": 0.13998590359890162,
|
|
"learning_rate": 9.437690617591202e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119852881.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1447
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.160573476702509,
|
|
"grad_norm": 0.0988656358265864,
|
|
"learning_rate": 9.426456656478918e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 119930506.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1448
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.8788016332274455e-09,
|
|
"advantages/std": 0.5726749897003174,
|
|
"advantages/var": 0.3279566438282586,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.163440860215053,
|
|
"grad_norm": 0.19127430664342882,
|
|
"learning_rate": 9.415223421531307e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120018529.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1449
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.166308243727599,
|
|
"grad_norm": 0.13094005799328043,
|
|
"learning_rate": 9.403990926970789e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 120094154.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.781509278854418e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.1691756272401435,
|
|
"grad_norm": 0.10072318247330422,
|
|
"learning_rate": 9.392759187018857e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120189757.0,
|
|
"reward": 0.578125,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4957992732524872,
|
|
"step": 1451
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958501673983143e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.172043010752688,
|
|
"grad_norm": 0.12826479513139036,
|
|
"learning_rate": 9.381528215896048e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120271028.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1452
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.065668450848788e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.174910394265233,
|
|
"grad_norm": 0.1440767155168118,
|
|
"learning_rate": 9.370298027821922e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120348702.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.14966703951358795,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1453
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.177777777777778,
|
|
"grad_norm": 0.09945515704015066,
|
|
"learning_rate": 9.359068637015047e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120423446.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1454
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907505770133387e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.180645161290323,
|
|
"grad_norm": 0.1040926400199926,
|
|
"learning_rate": 9.34784005769298e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120501820.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.183512544802867,
|
|
"grad_norm": 0.08000849681171769,
|
|
"learning_rate": 9.336612304072255e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120569450.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1456
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504924634842639e-09,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.186379928315413,
|
|
"grad_norm": 0.14289770413103803,
|
|
"learning_rate": 9.325385390368366e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120652604.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1457
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.189247311827957,
|
|
"grad_norm": 0.09349152460241286,
|
|
"learning_rate": 9.314159330795729e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120738338.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1458
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.192114695340502,
|
|
"grad_norm": 0.04832040714860593,
|
|
"learning_rate": 9.302934139567689e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120825619.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1459
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983562397524497e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.194982078853046,
|
|
"grad_norm": 0.09409055244967295,
|
|
"learning_rate": 9.291709830896485e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120902775.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.126037115417672e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.197849462365592,
|
|
"grad_norm": 0.17863877682661655,
|
|
"learning_rate": 9.280486418993253e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 120988258.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1461
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.200716845878136,
|
|
"grad_norm": 0.09966702242066987,
|
|
"learning_rate": 9.269263918067969e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 121073667.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1462
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.203584229390681,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.258042342329479e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 121145750.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1463
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.2064516129032254,
|
|
"grad_norm": 0.08595277767135193,
|
|
"learning_rate": 9.246821705985446e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 121223289.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1464
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983539800525091e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.209318996415771,
|
|
"grad_norm": 0.10545558084009203,
|
|
"learning_rate": 9.235602023242348e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 121300497.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966859224177393e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.2121863799283155,
|
|
"grad_norm": 0.1166465405118057,
|
|
"learning_rate": 9.224383308305446e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 121385926.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1466
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.21505376344086,
|
|
"grad_norm": 0.12702730738938423,
|
|
"learning_rate": 9.213165575378792e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 121458830.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1467
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983539546627677e-09,
|
|
"advantages/std": 0.4675854444503784,
|
|
"advantages/var": 0.21863614786185792,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.217921146953405,
|
|
"grad_norm": 0.18718106551736144,
|
|
"learning_rate": 9.201948838665182e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 121542658.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1468
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998248400596406e-09,
|
|
"advantages/std": 0.4049536883831024,
|
|
"advantages/var": 0.16398748973507882,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 4.22078853046595,
|
|
"grad_norm": 0.07568000791039003,
|
|
"learning_rate": 9.190733112366156e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 121639487.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1469
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876548503938182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.223655913978495,
|
|
"grad_norm": 0.11969325026531555,
|
|
"learning_rate": 9.17951841068197e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 121707906.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.226523297491039,
|
|
"grad_norm": 0.07745000096218684,
|
|
"learning_rate": 9.168304747811587e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 121778917.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1471
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 5.6336549553791566e-09,
|
|
"advantages/std": 0.6612563729286194,
|
|
"advantages/var": 0.43725999073871336,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.229390681003585,
|
|
"grad_norm": 0.15047688601036804,
|
|
"learning_rate": 9.157092137952656e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 121857601.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.18361148238182068,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1472
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.975067111642947e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 4.232258064516129,
|
|
"grad_norm": 0.09297168256476258,
|
|
"learning_rate": 9.145880595301493e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 121933925.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1473
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917335935919526e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.235125448028674,
|
|
"grad_norm": 0.10083035649293086,
|
|
"learning_rate": 9.134670134053054e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122011969.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1474
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.237992831541218,
|
|
"grad_norm": 0.12954753409390488,
|
|
"learning_rate": 9.123460768400933e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122081124.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.1292129760556737e-08,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.240860215053764,
|
|
"grad_norm": 0.17851557977755236,
|
|
"learning_rate": 9.112252512537341e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122169256.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.1820138692855835,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1476
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599614475511504e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.243727598566308,
|
|
"grad_norm": 0.06532798513154459,
|
|
"learning_rate": 9.101045380653074e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122251469.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1477
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.246594982078853,
|
|
"grad_norm": 0.09513417861363471,
|
|
"learning_rate": 9.089839386937516e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122332731.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1478
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950488391937882e-08,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.2494623655913975,
|
|
"grad_norm": 0.17134657471060075,
|
|
"learning_rate": 9.078634545578597e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122405488.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1479
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96693437436781e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.252329749103943,
|
|
"grad_norm": 0.12659718453337696,
|
|
"learning_rate": 9.067430870762795e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122493552.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.2551971326164875,
|
|
"grad_norm": 0.04849278320959078,
|
|
"learning_rate": 9.056228376675117e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 122576239.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1481
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.258064516129032,
|
|
"grad_norm": 0.06914980251434022,
|
|
"learning_rate": 9.045027077499066e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122664171.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1482
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2997782960918235e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.260931899641577,
|
|
"grad_norm": 0.07948991734477566,
|
|
"learning_rate": 9.033826987416632e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 122752107.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1483
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344456541825744e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.263799283154122,
|
|
"grad_norm": 0.1428955902481616,
|
|
"learning_rate": 9.02262812060828e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122828380.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1484
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.266666666666667,
|
|
"grad_norm": 0.07377116415118819,
|
|
"learning_rate": 9.011430491252923e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 122899534.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.269534050179211,
|
|
"grad_norm": 0.0768415894619512,
|
|
"learning_rate": 9.000234113527911e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 122974114.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1486
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.272401433691757,
|
|
"grad_norm": 0.17149827069623058,
|
|
"learning_rate": 8.989039001609e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123047232.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1487
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.275268817204301,
|
|
"grad_norm": 0.10217273379462666,
|
|
"learning_rate": 8.977845169670352e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123120772.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1488
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.278136200716846,
|
|
"grad_norm": 0.10975235544047096,
|
|
"learning_rate": 8.966652631884504e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 123197304.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1489
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.28100358422939,
|
|
"grad_norm": 0.03228892691985338,
|
|
"learning_rate": 8.955461402422364e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123274259.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.9752007807758586e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.283870967741936,
|
|
"grad_norm": 0.11509669919237757,
|
|
"learning_rate": 8.944271495453166e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123361614.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1491
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.28673835125448,
|
|
"grad_norm": 0.07993581383478614,
|
|
"learning_rate": 8.933082925144485e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123450796.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1492
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.289605734767025,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.921895705662193e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123521096.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1493
|
|
},
|
|
{
|
|
"advantages/mean": 5.122274160385132e-09,
|
|
"advantages/snr": 7.745951594996348e-09,
|
|
"advantages/std": 0.6612840294837952,
|
|
"advantages/var": 0.4372965676503249,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.29247311827957,
|
|
"grad_norm": 0.13912726542928147,
|
|
"learning_rate": 8.910709851170467e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 123610144.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.22119548916816711,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1494
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 1.1266523706756892e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.295340501792115,
|
|
"grad_norm": 0.05994487449854081,
|
|
"learning_rate": 8.899525375831731e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 123687905.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.9792225930357895e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.2982078853046595,
|
|
"grad_norm": 0.1009806286692767,
|
|
"learning_rate": 8.888342293806689e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123773636.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.11678344756364822,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1496
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.301075268817204,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.877160619254264e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123842388.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1497
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.3039426523297495,
|
|
"grad_norm": 0.09998205361814819,
|
|
"learning_rate": 8.865980366331606e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 123926373.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1498
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049402934764073e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.306810035842294,
|
|
"grad_norm": 0.0875716518529315,
|
|
"learning_rate": 8.854801549194054e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 124000519.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1499
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.309677419354839,
|
|
"grad_norm": 0.07652125021746135,
|
|
"learning_rate": 8.843624181995144e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 124092098.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917003347966285e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.312544802867383,
|
|
"grad_norm": 0.09695003076082673,
|
|
"learning_rate": 8.832448278886566e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 124169758.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1501
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.315412186379929,
|
|
"grad_norm": 0.07080920782055285,
|
|
"learning_rate": 8.821273854018162e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 124248394.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1502
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.318279569892473,
|
|
"grad_norm": 0.06631141915455245,
|
|
"learning_rate": 8.810100921537893e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 124313055.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1503
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907227504745508e-10,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.321146953405018,
|
|
"grad_norm": 0.12112345623153241,
|
|
"learning_rate": 8.798929495591839e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 124401221.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1504
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958849501312727e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.324014336917562,
|
|
"grad_norm": 0.14301317683757267,
|
|
"learning_rate": 8.787759590324175e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 124470343.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.326881720430108,
|
|
"grad_norm": 0.061120960977824365,
|
|
"learning_rate": 8.776591219877145e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 124550338.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1506
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5056415302478586e-09,
|
|
"advantages/std": 0.6185553073883057,
|
|
"advantages/var": 0.3826106682982413,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.329749103942652,
|
|
"grad_norm": 0.1754009073013372,
|
|
"learning_rate": 8.765424398391046e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 124638848.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.16834919154644012,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1507
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379887186086637e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.332616487455197,
|
|
"grad_norm": 0.08519214614169628,
|
|
"learning_rate": 8.75425914000422e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 124722132.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1508
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.494855892459685e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.335483870967742,
|
|
"grad_norm": 0.08760196360323062,
|
|
"learning_rate": 8.743095458853032e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 124800189.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1509
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.338351254480287,
|
|
"grad_norm": 0.05702860630164658,
|
|
"learning_rate": 8.731933369071849e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 124876940.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875550720364307e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.3412186379928315,
|
|
"grad_norm": 0.08605429619390997,
|
|
"learning_rate": 8.720772884793015e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 124960083.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1511
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.967066906198935e-09,
|
|
"advantages/std": 0.4675861597061157,
|
|
"advantages/var": 0.21863681674871316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.344086021505376,
|
|
"grad_norm": 0.11213886305841794,
|
|
"learning_rate": 8.70961402014685e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 125040014.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1512
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344384639658041e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 4.3469534050179215,
|
|
"grad_norm": 0.14903303408755642,
|
|
"learning_rate": 8.698456789261616e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 125121396.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1513
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1498891480459116e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.349820788530466,
|
|
"grad_norm": 0.07735068537130135,
|
|
"learning_rate": 8.687301206263518e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 125199687.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1514
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954489382432772e-08,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.352688172043011,
|
|
"grad_norm": 0.13479114473642162,
|
|
"learning_rate": 8.676147285276667e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 125272048.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5055885760084386e-09,
|
|
"advantages/std": 0.6185770630836487,
|
|
"advantages/var": 0.3826375829731923,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.355555555555555,
|
|
"grad_norm": 0.14266694682150755,
|
|
"learning_rate": 8.664995040423067e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 125363968.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.19568344950675964,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1516
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.358422939068101,
|
|
"grad_norm": 0.12138187059214987,
|
|
"learning_rate": 8.653844485822602e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 125438839.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1517
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2525469477123842e-09,
|
|
"advantages/std": 0.5726727843284607,
|
|
"advantages/var": 0.32795411791051166,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 4.361290322580645,
|
|
"grad_norm": 0.11781333290531952,
|
|
"learning_rate": 8.642695635593023e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 125529165.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1462521106004715,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1518
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629381233631915e-09,
|
|
"advantages/std": 0.5227834582328796,
|
|
"advantages/var": 0.273302544201929,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.36415770609319,
|
|
"grad_norm": 0.11450533395385713,
|
|
"learning_rate": 8.631548503849915e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 125613193.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1519
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.367025089605734,
|
|
"grad_norm": 0.1717613200259846,
|
|
"learning_rate": 8.620403104706686e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 125680840.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380984365940094,
|
|
"advantages/var": 0.05466704299203351,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.36989247311828,
|
|
"grad_norm": 0.05959738383725868,
|
|
"learning_rate": 8.609259452274557e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 125751046.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1521
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.372759856630824,
|
|
"grad_norm": 0.1400111794092891,
|
|
"learning_rate": 8.598117560662533e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 125825873.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1522
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 4.375627240143369,
|
|
"grad_norm": 0.041986627317893196,
|
|
"learning_rate": 8.586977443977396e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 125904104.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1523
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917033813576203e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 4.378494623655914,
|
|
"grad_norm": 0.11661446312178016,
|
|
"learning_rate": 8.575839116323669e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 125996254.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1524
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 4.381362007168459,
|
|
"grad_norm": 0.09182377957203804,
|
|
"learning_rate": 8.564702591803619e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 126077138.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875832530345343e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.3842293906810035,
|
|
"grad_norm": 0.15097119268491282,
|
|
"learning_rate": 8.553567884517226e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 126145191.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1526
|
|
},
|
|
{
|
|
"advantages/mean": -7.916241884231567e-09,
|
|
"advantages/snr": 1.6929100023819388e-08,
|
|
"advantages/std": 0.46761149168014526,
|
|
"advantages/var": 0.21866050715133056,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.387096774193548,
|
|
"grad_norm": 0.12967105364753814,
|
|
"learning_rate": 8.54243500856218e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 126226663.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.12255740165710449,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1527
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814751550759118e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.3899641577060935,
|
|
"grad_norm": 0.11527739695198473,
|
|
"learning_rate": 8.531303978033829e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 126298221.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1528
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.392831541218638,
|
|
"grad_norm": 0.1883962631871225,
|
|
"learning_rate": 8.520174807025209e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 126364653.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1529
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.979258453394051e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 4.395698924731183,
|
|
"grad_norm": 0.09878037298414231,
|
|
"learning_rate": 8.509047509626987e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 126458094.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.398566308243727,
|
|
"grad_norm": 0.03007972669000544,
|
|
"learning_rate": 8.497922099927468e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 126532719.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1531
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.401433691756273,
|
|
"grad_norm": 0.13238509731892936,
|
|
"learning_rate": 8.486798592012552e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 126608679.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1532
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958683201273463e-10,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.404301075268817,
|
|
"grad_norm": 0.0757346675969034,
|
|
"learning_rate": 8.475676999965746e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 126690558.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1533
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721682514236524e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.407168458781362,
|
|
"grad_norm": 0.11811042309308818,
|
|
"learning_rate": 8.464557337868126e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 126767154.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 1534
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5628345390257e-09,
|
|
"advantages/std": 0.5227986574172974,
|
|
"advantages/var": 0.27331843619732865,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.410035842293907,
|
|
"grad_norm": 0.13639475877953225,
|
|
"learning_rate": 8.453439619798324e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 126844983.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.14677615463733673,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.412903225806452,
|
|
"grad_norm": 0.08356876229843305,
|
|
"learning_rate": 8.442323859832508e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 126926652.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1536
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 9.95862671130252e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.415770609318996,
|
|
"grad_norm": 0.11748319686103294,
|
|
"learning_rate": 8.43121007204437e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127001488.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1537
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.418637992831541,
|
|
"grad_norm": 0.0793231963490384,
|
|
"learning_rate": 8.420098270505108e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127072722.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1538
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.421505376344086,
|
|
"grad_norm": 0.055645089131594244,
|
|
"learning_rate": 8.408988469283402e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 127144584.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1539
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.424372759856631,
|
|
"grad_norm": 0.056100176748938295,
|
|
"learning_rate": 8.397880682445396e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127219968.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.4272401433691755,
|
|
"grad_norm": 0.10463434059866998,
|
|
"learning_rate": 8.386774924054685e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 127296150.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1541
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.43010752688172,
|
|
"grad_norm": 0.05977362799968743,
|
|
"learning_rate": 8.375671208172304e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127361298.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1542
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.4329749103942655,
|
|
"grad_norm": 0.09174086666333871,
|
|
"learning_rate": 8.364569548856694e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 127449389.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1236182302236557,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1543
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.43584229390681,
|
|
"grad_norm": 0.0755987201652617,
|
|
"learning_rate": 8.353469960163689e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127519611.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1544
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.4948445614343708e-08,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.438709677419355,
|
|
"grad_norm": 0.08453683000937354,
|
|
"learning_rate": 8.34237245614651e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127606813.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907189930094833e-09,
|
|
"advantages/std": 0.5227925777435303,
|
|
"advantages/var": 0.27331207934372515,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.4415770609319,
|
|
"grad_norm": 0.13134660568169892,
|
|
"learning_rate": 8.331277050855732e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127685029.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.13782459497451782,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1546
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.444444444444445,
|
|
"grad_norm": 0.05309177452297175,
|
|
"learning_rate": 8.320183758339283e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127762373.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1547
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.447311827956989,
|
|
"grad_norm": 0.13427940408964043,
|
|
"learning_rate": 8.309092592642401e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 127838504.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1548
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 4.450179211469534,
|
|
"grad_norm": 0.11582365325455868,
|
|
"learning_rate": 8.29800356780764e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 127928687.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1549
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.453046594982079,
|
|
"grad_norm": 0.053111368806384486,
|
|
"learning_rate": 8.286916697874841e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128004872.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262492693233955e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.455913978494624,
|
|
"grad_norm": 0.11176594266265587,
|
|
"learning_rate": 8.275831996881127e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128095545.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1551
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975244195968941e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.458781362007168,
|
|
"grad_norm": 0.13524869902232045,
|
|
"learning_rate": 8.264749478860853e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128170833.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1552
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.461648745519713,
|
|
"grad_norm": 0.07916711345930293,
|
|
"learning_rate": 8.253669157845631e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128235550.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1553
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.464516129032258,
|
|
"grad_norm": 0.09645282716884043,
|
|
"learning_rate": 8.24259104786428e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128310662.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1554
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.467383512544803,
|
|
"grad_norm": 0.07946594741567561,
|
|
"learning_rate": 8.231515162942822e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128380371.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.4702508960573475,
|
|
"grad_norm": 0.08806504776261509,
|
|
"learning_rate": 8.220441517104471e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128463092.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1556
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.473118279569892,
|
|
"grad_norm": 0.05649365302533599,
|
|
"learning_rate": 8.209370124369588e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 128546682.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1557
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954106789488524e-08,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.4759856630824375,
|
|
"grad_norm": 0.10687162860127353,
|
|
"learning_rate": 8.198300998755696e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 128628640.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1236182302236557,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1558
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962614376833355e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.478853046594982,
|
|
"grad_norm": 0.10378706897507785,
|
|
"learning_rate": 8.187234154277439e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128711989.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1559
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.481720430107527,
|
|
"grad_norm": 0.1279554688962821,
|
|
"learning_rate": 8.176169604946586e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128790667.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.484587813620072,
|
|
"grad_norm": 0.13033485611719636,
|
|
"learning_rate": 8.165107364771978e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128875084.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1561
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.6612711548805237,
|
|
"advantages/var": 0.43727954027702154,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.487455197132617,
|
|
"grad_norm": 0.13625062308115246,
|
|
"learning_rate": 8.154047447759554e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 128968539.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.20411096513271332,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1562
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.490322580645161,
|
|
"grad_norm": 0.09146346001930003,
|
|
"learning_rate": 8.142989867912298e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129038631.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1563
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.493189964157706,
|
|
"grad_norm": 0.14823129251144393,
|
|
"learning_rate": 8.131934639230244e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129117638.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1564
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629503101518235e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.496057347670251,
|
|
"grad_norm": 0.09899485072652606,
|
|
"learning_rate": 8.12088177571044e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129204605.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12863080203533173,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.258373724324303e-09,
|
|
"advantages/std": 0.6185795664787292,
|
|
"advantages/var": 0.3826406800650126,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.498924731182796,
|
|
"grad_norm": 0.13593794888110886,
|
|
"learning_rate": 8.109831291346948e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 129289557.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.2001592218875885,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1566
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.50179211469534,
|
|
"grad_norm": 0.07242409317259144,
|
|
"learning_rate": 8.098783200130812e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129369624.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1567
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.504659498207886,
|
|
"grad_norm": 0.09399296263010991,
|
|
"learning_rate": 8.087737516050053e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129446421.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1568
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.50752688172043,
|
|
"grad_norm": 0.09191536758570197,
|
|
"learning_rate": 8.076694253089631e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129515430.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1569
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.626265687704098e-09,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.510394265232975,
|
|
"grad_norm": 0.1522348482437637,
|
|
"learning_rate": 8.065653425231452e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129598753.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.15072788298130035,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.5132616487455195,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 8.05461504645434e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129669733.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1571
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.516129032258064,
|
|
"grad_norm": 0.1251916490484711,
|
|
"learning_rate": 8.043579130734013e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 129741109.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1572
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2524951534513563e-09,
|
|
"advantages/std": 0.5726819038391113,
|
|
"advantages/var": 0.32796456298478915,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.5189964157706095,
|
|
"grad_norm": 0.14549734052264987,
|
|
"learning_rate": 8.032545692043068e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 129831912.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.15756267309188843,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1573
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.979139449767511e-09,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.521863799283154,
|
|
"grad_norm": 0.11984844576041742,
|
|
"learning_rate": 8.021514744350969e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 129911905.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.1236182302236557,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 1574
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.524731182795699,
|
|
"grad_norm": 0.1081565198875998,
|
|
"learning_rate": 8.010486301624032e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 130005519.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998026679709143e-09,
|
|
"advantages/std": 0.4049575924873352,
|
|
"advantages/var": 0.16399065171313865,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.527598566308244,
|
|
"grad_norm": 0.1116032107301051,
|
|
"learning_rate": 7.999460377825395e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 130082566.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08891239762306213,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1576
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.530465949820789,
|
|
"grad_norm": 0.1638845893683361,
|
|
"learning_rate": 7.988436986915003e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 130165929.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1577
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.533333333333333,
|
|
"grad_norm": 0.13295662249426007,
|
|
"learning_rate": 7.977416142849605e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 130241029.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1578
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40496885776519775,
|
|
"advantages/var": 0.16399977575964897,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.536200716845878,
|
|
"grad_norm": 0.10972079286238992,
|
|
"learning_rate": 7.966397859582712e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 130322199.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.09810130298137665,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1579
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.1258055642951985e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 4.539068100358423,
|
|
"grad_norm": 0.12422660645392816,
|
|
"learning_rate": 7.955382151064609e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 130409780.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.6262611176060706e-09,
|
|
"advantages/std": 0.5726771354675293,
|
|
"advantages/var": 0.3279591014872949,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.541935483870968,
|
|
"grad_norm": 0.0780531493375495,
|
|
"learning_rate": 7.944369031242306e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 130495116.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1530819982290268,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1581
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.544802867383512,
|
|
"grad_norm": 0.09287131038371223,
|
|
"learning_rate": 7.933358514059542e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 130574069.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1582
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.547670250896058,
|
|
"grad_norm": 0.16809308550778532,
|
|
"learning_rate": 7.922350613456763e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 130654253.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1583
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.550537634408602,
|
|
"grad_norm": 0.06683962956290307,
|
|
"learning_rate": 7.911345343371103e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 130733105.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1584
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.553405017921147,
|
|
"grad_norm": 0.09373972474736782,
|
|
"learning_rate": 7.900342717736353e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 130822672.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.5562724014336915,
|
|
"grad_norm": 0.10264809816037185,
|
|
"learning_rate": 7.88934275048297e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 130897567.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1586
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.559139784946236,
|
|
"grad_norm": 0.10525879291354667,
|
|
"learning_rate": 7.878345455538043e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 130975683.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1587
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494468808174133,
|
|
"advantages/var": 0.16398020040561878,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.5620071684587815,
|
|
"grad_norm": 0.09257790509906005,
|
|
"learning_rate": 7.867350846825271e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 131055145.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1588
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.564874551971326,
|
|
"grad_norm": 0.1446030853921875,
|
|
"learning_rate": 7.856358938264953e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 131133287.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1589
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.567741935483871,
|
|
"grad_norm": 0.06268478241855875,
|
|
"learning_rate": 7.84536974377398e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 131197738.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.570609318996416,
|
|
"grad_norm": 0.09331097190519154,
|
|
"learning_rate": 7.834383277265792e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 131275636.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.08443662524223328,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1591
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983628920516591e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.573476702508961,
|
|
"grad_norm": 0.18006050372622187,
|
|
"learning_rate": 7.823399552650383e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 131361700.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1592
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.576344086021505,
|
|
"grad_norm": 0.16944149752660384,
|
|
"learning_rate": 7.812418583834281e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 131434842.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1593
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917221686896894e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.57921146953405,
|
|
"grad_norm": 0.1036148638309728,
|
|
"learning_rate": 7.801440384720509e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 131513412.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1594
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96706741399221e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.582078853046595,
|
|
"grad_norm": 0.16731095967597007,
|
|
"learning_rate": 7.790464969208597e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 131591086.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.5050938954247684e-09,
|
|
"advantages/std": 0.5726727843284607,
|
|
"advantages/var": 0.32795411791051166,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.58494623655914,
|
|
"grad_norm": 0.14980625968664033,
|
|
"learning_rate": 7.779492351194546e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 131676647.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1462521106004715,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1596
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 4.587813620071684,
|
|
"grad_norm": 0.25115726970915364,
|
|
"learning_rate": 7.768522544570817e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 131753930.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1597
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998806953079044e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.59068100358423,
|
|
"grad_norm": 0.07780192650417037,
|
|
"learning_rate": 7.757555563226305e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 131837580.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1598
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917699002625455e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.593548387096774,
|
|
"grad_norm": 0.0806428245538292,
|
|
"learning_rate": 7.746591421046335e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 131916844.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1599
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 4.596415770609319,
|
|
"grad_norm": 0.0723027089703405,
|
|
"learning_rate": 7.735630131912637e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 132008689.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.5992831541218635,
|
|
"grad_norm": 0.08158367358736894,
|
|
"learning_rate": 7.724671709703326e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 132073467.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1601
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.528079978419126e-09,
|
|
"advantages/std": 0.6185657978057861,
|
|
"advantages/var": 0.3826236462151087,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.602150537634409,
|
|
"grad_norm": 0.14388181305992928,
|
|
"learning_rate": 7.713716168292887e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 132155281.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1830746978521347,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1602
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.6050179211469535,
|
|
"grad_norm": 0.06219203290418096,
|
|
"learning_rate": 7.702763521552153e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 132241545.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1603
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.607885304659498,
|
|
"grad_norm": 0.09435091549837735,
|
|
"learning_rate": 7.691813783348308e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 132321404.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1604
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907308748317195e-09,
|
|
"advantages/std": 0.5227856040000916,
|
|
"advantages/var": 0.27330478774974054,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.610752688172043,
|
|
"grad_norm": 0.14888850594239775,
|
|
"learning_rate": 7.68086696754484e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 132400975.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.613620071684588,
|
|
"grad_norm": 0.10091741032199689,
|
|
"learning_rate": 7.669923088001537e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 132481527.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1606
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814615465526806e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.616487455197133,
|
|
"grad_norm": 0.12915255405954504,
|
|
"learning_rate": 7.658982158574469e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 132564893.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 1607
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.29459053084411e-08,
|
|
"advantages/std": 0.4676070511341095,
|
|
"advantages/var": 0.2186563542703377,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.619354838709677,
|
|
"grad_norm": 0.10581661925296995,
|
|
"learning_rate": 7.648044193115983e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 132640397.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.12019838392734528,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1608
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.2470340303516295e-08,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.622222222222222,
|
|
"grad_norm": 0.12392200488142388,
|
|
"learning_rate": 7.637109205474663e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 132725080.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1609
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.625089605734767,
|
|
"grad_norm": 0.10554181629616402,
|
|
"learning_rate": 7.626177209495319e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 132789271.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.627956989247312,
|
|
"grad_norm": 0.086996552301831,
|
|
"learning_rate": 7.615248219018981e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 132871722.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1611
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149940263022739e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.630824372759856,
|
|
"grad_norm": 0.14215844478089376,
|
|
"learning_rate": 7.60432224788287e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 132946368.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1612
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599795920412534e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.633691756272402,
|
|
"grad_norm": 0.14701246559534492,
|
|
"learning_rate": 7.593399309920393e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 133021640.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1613
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.636559139784946,
|
|
"grad_norm": 0.11497633964923037,
|
|
"learning_rate": 7.582479418961101e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 133097958.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1614
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.639426523297491,
|
|
"grad_norm": 0.05788553387589012,
|
|
"learning_rate": 7.571562588830697e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 133191439.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 3.763983771436959e-09,
|
|
"advantages/std": 0.6185750365257263,
|
|
"advantages/var": 0.38263507581280365,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.6422939068100355,
|
|
"grad_norm": 0.14491687864547192,
|
|
"learning_rate": 7.560648833351007e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 133274277.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.1922685205936432,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1616
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.645161290322581,
|
|
"grad_norm": 0.12258378917704145,
|
|
"learning_rate": 7.54973816633997e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 133346144.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1617
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599521727490371e-09,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.6480286738351255,
|
|
"grad_norm": 0.07265957256575328,
|
|
"learning_rate": 7.538830601611599e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 133431302.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1618
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.65089605734767,
|
|
"grad_norm": 0.08736898389564368,
|
|
"learning_rate": 7.527926152975999e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 133508245.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1619
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.6537634408602155,
|
|
"grad_norm": 0.027703098343415788,
|
|
"learning_rate": 7.517024834239311e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 133594592.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.65663082437276,
|
|
"grad_norm": 0.08427394693962005,
|
|
"learning_rate": 7.506126659203732e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 133669055.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1621
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.659498207885305,
|
|
"grad_norm": 0.05510938332542658,
|
|
"learning_rate": 7.495231641667458e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 133751963.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1622
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.662365591397849,
|
|
"grad_norm": 0.05035489070551885,
|
|
"learning_rate": 7.484339795424705e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 133827487.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1623
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262673803395154e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.665232974910394,
|
|
"grad_norm": 0.15364144928913698,
|
|
"learning_rate": 7.473451134265665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 133915334.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1624
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.668100358422939,
|
|
"grad_norm": 0.10363090295376025,
|
|
"learning_rate": 7.462565671976503e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 133989607.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.670967741935484,
|
|
"grad_norm": 0.07421035931391864,
|
|
"learning_rate": 7.451683422339323e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 134060459.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1626
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.673835125448028,
|
|
"grad_norm": 0.13227855390169405,
|
|
"learning_rate": 7.440804399132172e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 134126741.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1627
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.5049727039524085e-09,
|
|
"advantages/std": 0.5726834535598755,
|
|
"advantages/var": 0.32796633798126607,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 4.676702508960574,
|
|
"grad_norm": 0.19359569030053653,
|
|
"learning_rate": 7.429928616129009e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 134215282.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1628
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.679569892473118,
|
|
"grad_norm": 0.05907275427514059,
|
|
"learning_rate": 7.419056087099694e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 134300129.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1629
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.4393256658538594e-09,
|
|
"advantages/std": 0.5726926326751709,
|
|
"advantages/var": 0.3279768515204182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.682437275985663,
|
|
"grad_norm": 0.1537068880657909,
|
|
"learning_rate": 7.408186825809957e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 134374852.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1712273508310318,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.6853046594982075,
|
|
"grad_norm": 0.13636365349874846,
|
|
"learning_rate": 7.397320846021397e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 134449949.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1631
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299802498719973e-09,
|
|
"advantages/std": 0.4049576222896576,
|
|
"advantages/var": 0.16399067585049298,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.688172043010753,
|
|
"grad_norm": 0.12425002393391864,
|
|
"learning_rate": 7.386458161491465e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 134523285.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.08891239762306213,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1632
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.6910394265232975,
|
|
"grad_norm": 0.09466145442952983,
|
|
"learning_rate": 7.375598785973429e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 134600090.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1633
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633180108710322e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.693906810035842,
|
|
"grad_norm": 0.11381714746002539,
|
|
"learning_rate": 7.364742733216372e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 134673298.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1634
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979139449767511e-09,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.6967741935483875,
|
|
"grad_norm": 0.10872391394227172,
|
|
"learning_rate": 7.353890016965169e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 134748297.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1236182302236557,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.699641577060932,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.343040650960469e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 134823428.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1636
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.702508960573477,
|
|
"grad_norm": 0.064011796942797,
|
|
"learning_rate": 7.332194648938688e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 134903274.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1637
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.705376344086021,
|
|
"grad_norm": 0.12022436045089688,
|
|
"learning_rate": 7.321352024631973e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 134986570.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1638
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450174584180815e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.708243727598567,
|
|
"grad_norm": 0.07791697208110278,
|
|
"learning_rate": 7.310512791768198e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135070456.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1639
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.711111111111111,
|
|
"grad_norm": 0.08557264076533448,
|
|
"learning_rate": 7.299676964070938e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135140402.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.713978494623656,
|
|
"grad_norm": 0.07883140905174156,
|
|
"learning_rate": 7.288844555259471e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135224668.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1641
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.7168458781362,
|
|
"grad_norm": 0.110481504205288,
|
|
"learning_rate": 7.278015579048734e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135301935.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1642
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954368597466913e-08,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.719713261648746,
|
|
"grad_norm": 0.10336929955002645,
|
|
"learning_rate": 7.267190049149317e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135384382.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1643
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875211793780565e-09,
|
|
"advantages/std": 0.4676063358783722,
|
|
"advantages/var": 0.21865568535359703,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.72258064516129,
|
|
"grad_norm": 0.09791630050111359,
|
|
"learning_rate": 7.256367979267455e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 135467754.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1644
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.970961834751672e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.725448028673835,
|
|
"grad_norm": 0.10299045133209574,
|
|
"learning_rate": 7.245549383104992e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135553008.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8994882297977766e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.7283154121863795,
|
|
"grad_norm": 0.14588499031325744,
|
|
"learning_rate": 7.234734274359388e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135639384.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1646
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 7.527839878197673e-10,
|
|
"advantages/std": 0.6185855269432068,
|
|
"advantages/var": 0.3826480541436048,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 4.731182795698925,
|
|
"grad_norm": 0.17317502917605887,
|
|
"learning_rate": 7.223922666723676e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 135719633.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.20699402689933777,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1647
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.7340501792114695,
|
|
"grad_norm": 0.13800906614522476,
|
|
"learning_rate": 7.213114573886458e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 135794499.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1648
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.736917562724014,
|
|
"grad_norm": 0.06521309150206973,
|
|
"learning_rate": 7.202310009531884e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135872734.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1649
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 8.049402934764073e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.7397849462365595,
|
|
"grad_norm": 0.0762940262182169,
|
|
"learning_rate": 7.191508987339654e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 135952386.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.742652329749104,
|
|
"grad_norm": 0.12225458348992939,
|
|
"learning_rate": 7.180711520984952e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136039969.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1651
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2523980013455208e-09,
|
|
"advantages/std": 0.5726990103721619,
|
|
"advantages/var": 0.32798415648125356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.745519713261649,
|
|
"grad_norm": 0.15372109186111152,
|
|
"learning_rate": 7.169917624138488e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136119659.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.17806214094161987,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1652
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.748387096774193,
|
|
"grad_norm": 0.10231956708089485,
|
|
"learning_rate": 7.15912731046644e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 136191006.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1653
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.235142426779239e-09,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.751254480286739,
|
|
"grad_norm": 0.13484719765167572,
|
|
"learning_rate": 7.148340593630452e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136271001.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1654
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.754121863799283,
|
|
"grad_norm": 0.08229723941001783,
|
|
"learning_rate": 7.137557487287607e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 136356037.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.756989247311828,
|
|
"grad_norm": 0.11561987468906357,
|
|
"learning_rate": 7.126778005090431e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136430606.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1656
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962749759103603e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.759856630824372,
|
|
"grad_norm": 0.20534656263558138,
|
|
"learning_rate": 7.11600216068685e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136506150.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1657
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.762724014336918,
|
|
"grad_norm": 0.23299202155065596,
|
|
"learning_rate": 7.105229967720191e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136583371.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1658
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975166886896984e-09,
|
|
"advantages/std": 0.4675965905189514,
|
|
"advantages/var": 0.21864657146494793,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.765591397849462,
|
|
"grad_norm": 0.09342418994880777,
|
|
"learning_rate": 7.09446143982915e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136666709.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.10994865745306015,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1659
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9750579720916185e-09,
|
|
"advantages/std": 0.4676051139831543,
|
|
"advantages/var": 0.21865454262319872,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.768458781362007,
|
|
"grad_norm": 0.08349540862040114,
|
|
"learning_rate": 7.083696590647786e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136756219.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11784426867961884,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.7713261648745515,
|
|
"grad_norm": 0.11801519200360466,
|
|
"learning_rate": 7.072935433805507e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 136841112.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1661
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2997337848958775e-09,
|
|
"advantages/std": 0.404969722032547,
|
|
"advantages/var": 0.16400047576311838,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 4.774193548387097,
|
|
"grad_norm": 0.06475097052472373,
|
|
"learning_rate": 7.06217798292704e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 136919042.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.09916213154792786,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1662
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.970911630250105e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 4.7770609318996415,
|
|
"grad_norm": 0.11656949784420355,
|
|
"learning_rate": 7.051424251632418e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 137007284.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 1663
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.779928315412186,
|
|
"grad_norm": 0.0945511456503056,
|
|
"learning_rate": 7.040674253536965e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 137077723.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1664
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917699002625455e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.7827956989247316,
|
|
"grad_norm": 0.12004320795593788,
|
|
"learning_rate": 7.029928002251287e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 137149648.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.785663082437276,
|
|
"grad_norm": 0.08839211137410177,
|
|
"learning_rate": 7.019185511381238e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 137223917.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1666
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.788530465949821,
|
|
"grad_norm": 0.03693768958955889,
|
|
"learning_rate": 7.008446794527909e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 137300499.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1667
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.0112319913676506e-09,
|
|
"advantages/std": 0.6185657978057861,
|
|
"advantages/var": 0.3826236462151087,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 4.791397849462365,
|
|
"grad_norm": 0.11623775362396632,
|
|
"learning_rate": 6.99771186528762e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 137387181.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1830746978521347,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1668
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.794265232974911,
|
|
"grad_norm": 0.13093804686208912,
|
|
"learning_rate": 6.986980737251888e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 137462936.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1669
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 8.962600667464421e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.797132616487455,
|
|
"grad_norm": 0.10717130907933577,
|
|
"learning_rate": 6.976253424007427e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 137538359.0,
|
|
"reward": 0.578125,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4957992732524872,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.8,
|
|
"grad_norm": 0.10350714272675278,
|
|
"learning_rate": 6.965529939136114e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 137614388.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1671
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.802867383512545,
|
|
"grad_norm": 0.06582431752065288,
|
|
"learning_rate": 6.954810296214976e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 137692045.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1672
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.3444254652277355e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.80573476702509,
|
|
"grad_norm": 0.13532073690127588,
|
|
"learning_rate": 6.944094508816181e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 137781720.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1673
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962814881146116e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.808602150537634,
|
|
"grad_norm": 0.09251980140920273,
|
|
"learning_rate": 6.933382590507016e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 137856287.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1674
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562997839424082e-09,
|
|
"advantages/std": 0.5227746963500977,
|
|
"advantages/var": 0.2732933831439368,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.811469534050179,
|
|
"grad_norm": 0.08853742787381969,
|
|
"learning_rate": 6.92267455484987e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 137940285.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.12179599702358246,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 4.8143369175627235,
|
|
"grad_norm": 0.07298975448657384,
|
|
"learning_rate": 6.91197041540221e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 138029888.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1676
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.817204301075269,
|
|
"grad_norm": 0.11434426915796067,
|
|
"learning_rate": 6.901270185716575e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 138108212.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1677
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96693437436781e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.8200716845878135,
|
|
"grad_norm": 0.08609948110605993,
|
|
"learning_rate": 6.89057387934055e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 138181691.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1678
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950134223285894e-08,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.822939068100358,
|
|
"grad_norm": 0.08392182725521677,
|
|
"learning_rate": 6.879881509816763e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 138270242.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1679
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.825806451612904,
|
|
"grad_norm": 0.07611886420636763,
|
|
"learning_rate": 6.869193090682843e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 138348798.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.828673835125448,
|
|
"grad_norm": 0.0982792481690089,
|
|
"learning_rate": 6.858508635471428e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 138420340.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 1681
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.425143889473195e-08,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.831541218637993,
|
|
"grad_norm": 0.10544123657587952,
|
|
"learning_rate": 6.847828157710127e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 138498723.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1682
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.834408602150537,
|
|
"grad_norm": 0.20239531167136152,
|
|
"learning_rate": 6.837151670921533e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 138588563.0,
|
|
"reward": 0.6171875,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.6171875,
|
|
"rewards/drgrpo_math_reward/std": 0.4879830479621887,
|
|
"step": 1683
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.837275985663083,
|
|
"grad_norm": 0.08460167166327032,
|
|
"learning_rate": 6.82647918862316e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 138681623.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1684
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.3360541173152519e-08,
|
|
"advantages/std": 0.5228021144866943,
|
|
"advantages/var": 0.27332205091175865,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.840143369175627,
|
|
"grad_norm": 0.09624207245640586,
|
|
"learning_rate": 6.815810724327468e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 138773832.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.15148437023162842,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.843010752688172,
|
|
"grad_norm": 0.11083136432676015,
|
|
"learning_rate": 6.805146291541831e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 138837254.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1686
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983378074428632e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.845878136200717,
|
|
"grad_norm": 0.1494266370747206,
|
|
"learning_rate": 6.794485903768512e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 138919092.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1687
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344431558649841e-09,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 4.848745519713262,
|
|
"grad_norm": 0.1249035905371772,
|
|
"learning_rate": 6.78382957450465e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 139003387.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1688
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599649341610871e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.851612903225806,
|
|
"grad_norm": 0.10591973747412967,
|
|
"learning_rate": 6.773177317242256e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 139073069.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1689
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.854480286738351,
|
|
"grad_norm": 0.05229478359096736,
|
|
"learning_rate": 6.762529145468179e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 139148797.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.857347670250896,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.751885072664095e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 139222062.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1691
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.860215053763441,
|
|
"grad_norm": 0.1356212755508729,
|
|
"learning_rate": 6.741245112306491e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 139292370.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1692
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.494855892459685e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.8630824372759855,
|
|
"grad_norm": 0.10644838913526689,
|
|
"learning_rate": 6.730609277866644e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 139376718.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1693
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.453574654603735e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.86594982078853,
|
|
"grad_norm": 0.08988747357697519,
|
|
"learning_rate": 6.719977582810617e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 139472108.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 1694
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299829409932592e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 4.868817204301076,
|
|
"grad_norm": 0.11191333046930001,
|
|
"learning_rate": 6.709350040599226e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 139555776.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.87168458781362,
|
|
"grad_norm": 0.09852394791062487,
|
|
"learning_rate": 6.698726664688025e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 139629721.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1696
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.97501037071382e-09,
|
|
"advantages/std": 0.46760883927345276,
|
|
"advantages/var": 0.21865802656666578,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.874551971326165,
|
|
"grad_norm": 0.09887033707653936,
|
|
"learning_rate": 6.688107468527295e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 139718114.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.11914245784282684,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1697
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.877419354838709,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.677492465562033e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 139788104.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1698
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.504868442276958e-09,
|
|
"advantages/std": 0.5726926326751709,
|
|
"advantages/var": 0.3279768515204182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 4.880286738351255,
|
|
"grad_norm": 0.13135190833765129,
|
|
"learning_rate": 6.666881669231921e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 139867176.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.171227365732193,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1699
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958180526526718e-10,
|
|
"advantages/std": 0.46761682629585266,
|
|
"advantages/var": 0.21866549623500564,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.883154121863799,
|
|
"grad_norm": 0.08760053973741729,
|
|
"learning_rate": 6.656275092971311e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 139951225.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1293872892856598,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.886021505376344,
|
|
"grad_norm": 0.089052736176109,
|
|
"learning_rate": 6.645672750209214e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140019646.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1701
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379866977655094e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.888888888888889,
|
|
"grad_norm": 0.11224937212072089,
|
|
"learning_rate": 6.635074654369286e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140089387.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1702
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.891756272401434,
|
|
"grad_norm": 0.07986553586560578,
|
|
"learning_rate": 6.624480818869806e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140162675.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1703
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.894623655913978,
|
|
"grad_norm": 0.11206446301223659,
|
|
"learning_rate": 6.613891257123652e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140226686.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1704
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.96265150658483e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 4.897491039426523,
|
|
"grad_norm": 0.09464547910317243,
|
|
"learning_rate": 6.603305982538294e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140305406.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380984365940094,
|
|
"advantages/var": 0.05466704299203351,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.900358422939068,
|
|
"grad_norm": 0.08937874680302588,
|
|
"learning_rate": 6.592725008515773e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 140383480.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1706
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958849501312727e-10,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.903225806451613,
|
|
"grad_norm": 0.09765013745455495,
|
|
"learning_rate": 6.582148348452699e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 140467072.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1707
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.9060931899641576,
|
|
"grad_norm": 0.1222620846497803,
|
|
"learning_rate": 6.571576015740191e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140541660.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1708
|
|
},
|
|
{
|
|
"advantages/mean": 6.51925802230835e-09,
|
|
"advantages/snr": 1.3941743289762473e-08,
|
|
"advantages/std": 0.4676070809364319,
|
|
"advantages/var": 0.21865638214189076,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.908960573476703,
|
|
"grad_norm": 0.09454060755806572,
|
|
"learning_rate": 6.561008023763914e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140625366.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.12019838392734528,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1709
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599614475511504e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.911827956989248,
|
|
"grad_norm": 0.13555003931504256,
|
|
"learning_rate": 6.550444385904032e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140695054.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 4.914695340501792,
|
|
"grad_norm": 0.09516021365964764,
|
|
"learning_rate": 6.539885115535186e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 140786979.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 1711
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.917562724014337,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.529330226026506e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140856971.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1712
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950488391937882e-08,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.920430107526881,
|
|
"grad_norm": 0.10092452127505862,
|
|
"learning_rate": 6.518779730741554e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 140936874.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1713
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.923297491039427,
|
|
"grad_norm": 0.04411808604439211,
|
|
"learning_rate": 6.508233643038341e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 141012761.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1714
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.926164874551971,
|
|
"grad_norm": 0.0322422152411568,
|
|
"learning_rate": 6.497691976269296e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 141097051.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.929032258064516,
|
|
"grad_norm": 0.09155685432631067,
|
|
"learning_rate": 6.487154743781256e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 141179243.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1716
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.45367316198517e-09,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.931899641577061,
|
|
"grad_norm": 0.10127201522358535,
|
|
"learning_rate": 6.476621958915424e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 141256649.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1717
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 4.934767025089606,
|
|
"grad_norm": 0.05331845694815187,
|
|
"learning_rate": 6.466093635007397e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 141335322.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1718
|
|
},
|
|
{
|
|
"advantages/mean": -7.916241884231567e-09,
|
|
"advantages/snr": 1.5141863799394746e-08,
|
|
"advantages/std": 0.5228049755096436,
|
|
"advantages/var": 0.273325042417639,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.93763440860215,
|
|
"grad_norm": 0.0997539260668846,
|
|
"learning_rate": 6.455569785387105e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 141430219.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1525501012802124,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1719
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.940501792114695,
|
|
"grad_norm": 0.08453368071186579,
|
|
"learning_rate": 6.44505042337883e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 141510843.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.94336917562724,
|
|
"grad_norm": 0.04205308392745947,
|
|
"learning_rate": 6.434535562301152e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 141583915.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1721
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.946236559139785,
|
|
"grad_norm": 0.11643484145917221,
|
|
"learning_rate": 6.424025215466968e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 141655988.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1722
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971078240891425e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.94910394265233,
|
|
"grad_norm": 0.10430840916517461,
|
|
"learning_rate": 6.413519396183455e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 141731014.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1723
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.951971326164875,
|
|
"grad_norm": 0.1523243992207566,
|
|
"learning_rate": 6.40301811775206e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 141791103.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1724
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917335935919526e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.95483870967742,
|
|
"grad_norm": 0.1217709933519384,
|
|
"learning_rate": 6.392521393468471e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 141865535.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.957706093189964,
|
|
"grad_norm": 0.0736153775168493,
|
|
"learning_rate": 6.382029236622617e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 141943036.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1726
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 4.960573476702509,
|
|
"grad_norm": 0.060291069484120795,
|
|
"learning_rate": 6.371541660498651e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142030491.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1727
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.963440860215054,
|
|
"grad_norm": 0.11269864326468705,
|
|
"learning_rate": 6.36105867837492e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142116698.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1728
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.453692965541534e-09,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 4.966308243727599,
|
|
"grad_norm": 0.14868033927371826,
|
|
"learning_rate": 6.350580303523946e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142195353.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1729
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878688398451923e-09,
|
|
"advantages/std": 0.5726882815361023,
|
|
"advantages/var": 0.32797186780877396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.969175627240143,
|
|
"grad_norm": 0.13763173975271825,
|
|
"learning_rate": 6.340106549212429e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 142275715.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1643974632024765,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149940263022739e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 4.972043010752688,
|
|
"grad_norm": 0.08337521404229342,
|
|
"learning_rate": 6.329637428701218e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 142355509.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1731
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227746963500977,
|
|
"advantages/var": 0.2732933831439368,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.974910394265233,
|
|
"grad_norm": 0.12437869174628784,
|
|
"learning_rate": 6.319172955245293e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142433421.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.12179599702358246,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1732
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.977777777777778,
|
|
"grad_norm": 0.04592323494933643,
|
|
"learning_rate": 6.308713142093748e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142510819.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1733
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.980645161290322,
|
|
"grad_norm": 0.13970535726888655,
|
|
"learning_rate": 6.298258002489779e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 142594790.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1734
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344431558649841e-09,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 4.983512544802867,
|
|
"grad_norm": 0.15244272723685726,
|
|
"learning_rate": 6.287807549670663e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142672168.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.986379928315412,
|
|
"grad_norm": 0.04776247984209465,
|
|
"learning_rate": 6.27736179686775e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142748771.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1736
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.989247311827957,
|
|
"grad_norm": 0.2015861481177434,
|
|
"learning_rate": 6.266920757306429e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142815438.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1737
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.992114695340502,
|
|
"grad_norm": 0.054341266197759155,
|
|
"learning_rate": 6.256484444206127e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 142895666.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1738
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 4.994982078853047,
|
|
"grad_norm": 0.10739314990443329,
|
|
"learning_rate": 6.246052870780287e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 142984818.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 1739
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983628920516591e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 4.997849462365592,
|
|
"grad_norm": 0.14704563705136212,
|
|
"learning_rate": 6.235626050236355e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143054398.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.002867383512545,
|
|
"grad_norm": 0.051420375009002865,
|
|
"learning_rate": 6.225203995775745e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143136232.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1741
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.00573476702509,
|
|
"grad_norm": 0.10426521127479207,
|
|
"learning_rate": 6.214786720593853e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143210767.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1742
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536284776845e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.008602150537635,
|
|
"grad_norm": 0.14482718305467646,
|
|
"learning_rate": 6.204374237880015e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143292941.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1743
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 1.1266523706756892e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.011469534050179,
|
|
"grad_norm": 0.059359201068545704,
|
|
"learning_rate": 6.193966560817507e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 143365284.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1744
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.014336917562724,
|
|
"grad_norm": 0.058939128235451325,
|
|
"learning_rate": 6.183563702583506e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143442545.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.017204301075269,
|
|
"grad_norm": 0.08724254557798017,
|
|
"learning_rate": 6.173165676349102e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143526805.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1746
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599658819865184e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.020071684587814,
|
|
"grad_norm": 0.07225058271238646,
|
|
"learning_rate": 6.162772495279264e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 143608375.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1747
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.966680494285092e-09,
|
|
"advantages/std": 0.46760883927345276,
|
|
"advantages/var": 0.21865802656666578,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 5.022939068100358,
|
|
"grad_norm": 0.09030024864295509,
|
|
"learning_rate": 6.152384172532819e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143701698.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.11914245784282684,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1748
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9916588263316927e-09,
|
|
"advantages/std": 0.46761149168014526,
|
|
"advantages/var": 0.21866050715133056,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.025806451612903,
|
|
"grad_norm": 0.14284937884647111,
|
|
"learning_rate": 6.142000721262458e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 143783579.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.12255740165710449,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1749
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.028673835125448,
|
|
"grad_norm": 0.0797696485104577,
|
|
"learning_rate": 6.131622154614683e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143864045.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.5050695213580615e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 5.031541218637993,
|
|
"grad_norm": 0.1158966547359424,
|
|
"learning_rate": 6.121248485729831e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 143950941.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1751
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.034408602150537,
|
|
"grad_norm": 0.05336124541650834,
|
|
"learning_rate": 6.110879727742027e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144022571.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1752
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.037275985663083,
|
|
"grad_norm": 0.06981000286252387,
|
|
"learning_rate": 6.100515893779188e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144105423.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1753
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.040143369175627,
|
|
"grad_norm": 0.1363742168996095,
|
|
"learning_rate": 6.09015699696298e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144176229.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1754
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.043010752688172,
|
|
"grad_norm": 0.04691676838152845,
|
|
"learning_rate": 6.079803050408836e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 144247996.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.045878136200717,
|
|
"grad_norm": 0.1530190523094471,
|
|
"learning_rate": 6.06945406722591e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144314751.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1756
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875164192348156e-09,
|
|
"advantages/std": 0.4676070809364319,
|
|
"advantages/var": 0.21865638214189076,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.048745519713262,
|
|
"grad_norm": 0.09774635096609512,
|
|
"learning_rate": 6.05911006051708e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 144398766.0,
|
|
"reward": 0.6484375,
|
|
"reward_std": 0.12019838392734528,
|
|
"rewards/drgrpo_math_reward/mean": 0.6484375,
|
|
"rewards/drgrpo_math_reward/std": 0.4793342351913452,
|
|
"step": 1757
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 8.049402934764073e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 5.051612903225807,
|
|
"grad_norm": 0.1040979028863591,
|
|
"learning_rate": 6.048771043378911e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144483471.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1758
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.054480286738351,
|
|
"grad_norm": 0.08131582776455688,
|
|
"learning_rate": 6.038437028901666e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 144563587.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1759
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.057347670250896,
|
|
"grad_norm": 0.05424336079130616,
|
|
"learning_rate": 6.028108030169265e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 144642656.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.060215053763441,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.017784060259279e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144715111.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1761
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 8.450247070110804e-09,
|
|
"advantages/std": 0.661274790763855,
|
|
"advantages/var": 0.4372843488997802,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.063082437275986,
|
|
"grad_norm": 0.1660673494178795,
|
|
"learning_rate": 6.00746513224291e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144807200.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.20753081142902374,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1762
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975244195968941e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.06594982078853,
|
|
"grad_norm": 0.18780836259720457,
|
|
"learning_rate": 5.997151259184979e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144884781.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1763
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.068817204301075,
|
|
"grad_norm": 0.07968917981618698,
|
|
"learning_rate": 5.98684245414391e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 144976402.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1764
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.07168458781362,
|
|
"grad_norm": 0.11952386312560818,
|
|
"learning_rate": 5.976538730171707e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 145052318.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.10205792635679245,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.074551971326165,
|
|
"grad_norm": 0.049016231424302455,
|
|
"learning_rate": 5.966240100313937e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 145116981.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1766
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.077419354838709,
|
|
"grad_norm": 0.048175777862949964,
|
|
"learning_rate": 5.95594657760972e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145197382.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1767
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.053907636466336e-08,
|
|
"advantages/std": 0.618579626083374,
|
|
"advantages/var": 0.3826407538054468,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.080286738351255,
|
|
"grad_norm": 0.14543054577741166,
|
|
"learning_rate": 5.945658175091719e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145276698.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.2001592218875885,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1768
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967257841033182e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.0831541218637994,
|
|
"grad_norm": 0.10064476075676881,
|
|
"learning_rate": 5.935374905786102e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145353875.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1769
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.086021505376344,
|
|
"grad_norm": 0.036890267935205494,
|
|
"learning_rate": 5.925096782712538e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145429444.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.088888888888889,
|
|
"grad_norm": 0.12470157001679923,
|
|
"learning_rate": 5.914823818884189e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 145507605.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1771
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.091756272401434,
|
|
"grad_norm": 0.05869326667095858,
|
|
"learning_rate": 5.904556027307679e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145587451.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1772
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.3942077395823529e-08,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.094623655913979,
|
|
"grad_norm": 0.10432344011039471,
|
|
"learning_rate": 5.894293420983089e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145662900.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1773
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907256955369e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.097491039426523,
|
|
"grad_norm": 0.09054092079051783,
|
|
"learning_rate": 5.884036012903921e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145746956.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1774
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899541037727662e-09,
|
|
"advantages/std": 0.40494978427886963,
|
|
"advantages/var": 0.16398432778750305,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.100358422939068,
|
|
"grad_norm": 0.09423064128177898,
|
|
"learning_rate": 5.873783816057114e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145811403.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.1579341628386252e-08,
|
|
"advantages/std": 0.5227928161621094,
|
|
"advantages/var": 0.2733123286307091,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.103225806451613,
|
|
"grad_norm": 0.12627040757335745,
|
|
"learning_rate": 5.863536843422995e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145896309.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 1776
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.106093189964158,
|
|
"grad_norm": 0.07948580732809564,
|
|
"learning_rate": 5.853295107975289e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 145962996.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1777
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125942055767658e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.108960573476702,
|
|
"grad_norm": 0.15472698679819985,
|
|
"learning_rate": 5.843058622681073e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146049384.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1778
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.111827956989248,
|
|
"grad_norm": 0.07359250655187671,
|
|
"learning_rate": 5.832827400500794e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146122420.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1779
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.114695340501792,
|
|
"grad_norm": 0.08254274226738148,
|
|
"learning_rate": 5.822601454388223e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146200474.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.267469408556425e-08,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.117562724014337,
|
|
"grad_norm": 0.0882255630075865,
|
|
"learning_rate": 5.812380797290463e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146276500.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1781
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.120430107526881,
|
|
"grad_norm": 0.10011928540464056,
|
|
"learning_rate": 5.802165442147911e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 146353975.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1782
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.123297491039427,
|
|
"grad_norm": 0.12919290574892164,
|
|
"learning_rate": 5.791955401894248e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 146427886.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1783
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.1261648745519715,
|
|
"grad_norm": 0.10845132767806537,
|
|
"learning_rate": 5.781750689456435e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146494899.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1784
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.129032258064516,
|
|
"grad_norm": 0.04768167934092357,
|
|
"learning_rate": 5.771551317754691e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146574156.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.131899641577061,
|
|
"grad_norm": 0.025027818818730816,
|
|
"learning_rate": 5.76135729970246e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146644653.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1786
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049627482891083,
|
|
"advantages/var": 0.16399482750186767,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.134767025089606,
|
|
"grad_norm": 0.08388678335748852,
|
|
"learning_rate": 5.75116864820641e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 146725445.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1787
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.137634408602151,
|
|
"grad_norm": 0.08288362155938514,
|
|
"learning_rate": 5.740985376166422e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146817818.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 1788
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.140501792114695,
|
|
"grad_norm": 0.04871816815622585,
|
|
"learning_rate": 5.730807496475567e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146890478.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1789
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.14336917562724,
|
|
"grad_norm": 0.08230015577181828,
|
|
"learning_rate": 5.720635022020082e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 146968403.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.970911630250105e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.146236559139785,
|
|
"grad_norm": 0.07019564524774083,
|
|
"learning_rate": 5.710467965679355e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 147051471.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1791
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.13136821560598e-09,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.14910394265233,
|
|
"grad_norm": 0.11210520406452633,
|
|
"learning_rate": 5.700306340325931e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147139169.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.1462520956993103,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1792
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.151971326164874,
|
|
"grad_norm": 0.10304077543283272,
|
|
"learning_rate": 5.690150158825462e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147215606.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1793
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.15483870967742,
|
|
"grad_norm": 0.06774208086141432,
|
|
"learning_rate": 5.679999434036724e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147283802.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1794
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.157706093189964,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.669854178811564e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147348294.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983539800525091e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.160573476702509,
|
|
"grad_norm": 0.10173609497794077,
|
|
"learning_rate": 5.659714405994925e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147425935.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1796
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.163440860215053,
|
|
"grad_norm": 0.12454392102126001,
|
|
"learning_rate": 5.649580128424791e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147505940.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1797
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2946288161655503e-08,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.166308243727599,
|
|
"grad_norm": 0.08493938281601522,
|
|
"learning_rate": 5.639451358932203e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147594173.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1798
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.1691756272401435,
|
|
"grad_norm": 0.0887236559371361,
|
|
"learning_rate": 5.629328110341217e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147673586.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1799
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.172043010752688,
|
|
"grad_norm": 0.06787093086525699,
|
|
"learning_rate": 5.619210395468907e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147768016.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2946288161655503e-08,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.174910394265233,
|
|
"grad_norm": 0.11292903222098369,
|
|
"learning_rate": 5.609098227125333e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147838620.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1801
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907256955369e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.177777777777778,
|
|
"grad_norm": 0.11244689407496437,
|
|
"learning_rate": 5.598991618113542e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 147924707.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1802
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.1266652673742488e-08,
|
|
"advantages/std": 0.33064746856689453,
|
|
"advantages/var": 0.10932774846969551,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.180645161290323,
|
|
"grad_norm": 0.07870096063958429,
|
|
"learning_rate": 5.58889058122953e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 147992761.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1803
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721015324148586e-09,
|
|
"advantages/std": 0.5228034257888794,
|
|
"advantages/var": 0.2733234220165883,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.183512544802867,
|
|
"grad_norm": 0.20088785350237562,
|
|
"learning_rate": 5.578795129262254e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148068237.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.15360605716705322,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1804
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306550085544586,
|
|
"advantages/var": 0.1093327346821491,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 5.186379928315413,
|
|
"grad_norm": 0.06858851307313002,
|
|
"learning_rate": 5.568705274993584e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148158495.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.189247311827957,
|
|
"grad_norm": 0.1523815068034485,
|
|
"learning_rate": 5.558621031198317e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148236114.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1806
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.987538125611118e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.192114695340502,
|
|
"grad_norm": 0.1202866833437615,
|
|
"learning_rate": 5.548542410644132e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 148321911.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1807
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.749390312251308e-09,
|
|
"advantages/std": 0.4049657881259918,
|
|
"advantages/var": 0.1639972895525057,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.194982078853046,
|
|
"grad_norm": 0.09572978561770634,
|
|
"learning_rate": 5.538469426091595e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148397750.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.094686359167099,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1808
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.197849462365592,
|
|
"grad_norm": 0.09354703080827789,
|
|
"learning_rate": 5.528402090294142e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148471544.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1809
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983349132682101e-09,
|
|
"advantages/std": 0.4676077961921692,
|
|
"advantages/var": 0.21865705105969724,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.200716845878136,
|
|
"grad_norm": 0.13539962414403606,
|
|
"learning_rate": 5.518340415998055e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148558072.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.12125921249389648,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958501673983143e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.203584229390681,
|
|
"grad_norm": 0.11131124567062191,
|
|
"learning_rate": 5.508284415942441e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148642583.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1811
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125909557323754e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.2064516129032254,
|
|
"grad_norm": 0.15026819642465544,
|
|
"learning_rate": 5.498234102859222e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 148722128.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1812
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.209318996415771,
|
|
"grad_norm": 0.06661326814815241,
|
|
"learning_rate": 5.488189489473131e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148799408.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1813
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.991766726549734e-09,
|
|
"advantages/std": 0.4675861597061157,
|
|
"advantages/var": 0.21863681674871316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.2121863799283155,
|
|
"grad_norm": 0.10657950688635445,
|
|
"learning_rate": 5.478150588501681e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148871489.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1814
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.21505376344086,
|
|
"grad_norm": 0.11086951659826781,
|
|
"learning_rate": 5.468117412655147e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 148942135.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380984365940094,
|
|
"advantages/var": 0.05466704299203351,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.217921146953405,
|
|
"grad_norm": 0.0813492842945334,
|
|
"learning_rate": 5.458089974636551e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 149013589.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1816
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.22078853046595,
|
|
"grad_norm": 0.09958757792398544,
|
|
"learning_rate": 5.448068287141662e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 149087877.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1817
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.223655913978495,
|
|
"grad_norm": 0.08548723075463324,
|
|
"learning_rate": 5.438052362858974e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 149180715.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443662524223328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1818
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.226523297491039,
|
|
"grad_norm": 0.12481555793055853,
|
|
"learning_rate": 5.428042214469661e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 149272076.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1819
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.229390681003585,
|
|
"grad_norm": 0.12064713889761638,
|
|
"learning_rate": 5.418037854647599e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 149354092.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966813525430481e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.232258064516129,
|
|
"grad_norm": 0.09892394825246684,
|
|
"learning_rate": 5.408039296059334e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 149442991.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1821
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.235125448028674,
|
|
"grad_norm": 0.10791824615849031,
|
|
"learning_rate": 5.398046551364078e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 149524910.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1822
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.237992831541218,
|
|
"grad_norm": 0.09244461993010375,
|
|
"learning_rate": 5.388059633213651e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 149616419.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1823
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.240860215053764,
|
|
"grad_norm": 0.16256213728805313,
|
|
"learning_rate": 5.378078554252523e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 149688072.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1824
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.3941844586185219e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.243727598566308,
|
|
"grad_norm": 0.09845655874877357,
|
|
"learning_rate": 5.368103327117768e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 149772056.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.246594982078853,
|
|
"grad_norm": 0.059794942562653566,
|
|
"learning_rate": 5.35813396443904e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 149847887.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1826
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.2494623655913975,
|
|
"grad_norm": 0.15207452579939762,
|
|
"learning_rate": 5.348170478838579e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 149932170.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1827
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.252329749103943,
|
|
"grad_norm": 0.11597795702763866,
|
|
"learning_rate": 5.338212882931172e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 150003031.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 1828
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.2551971326164875,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 5.328261189324166e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 150072094.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1829
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.258064516129032,
|
|
"grad_norm": 0.13879677550315522,
|
|
"learning_rate": 5.318315410617417e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 150148892.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 9.199209994879892e-09,
|
|
"advantages/std": 0.4049576222896576,
|
|
"advantages/var": 0.16399067585049298,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 5.260931899641577,
|
|
"grad_norm": 0.08015474128312423,
|
|
"learning_rate": 5.308375559403306e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 150233854.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08891239762306213,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1831
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983406762715241e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.263799283154122,
|
|
"grad_norm": 0.1087398856557787,
|
|
"learning_rate": 5.298441648266699e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 150312596.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1832
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 5.266666666666667,
|
|
"grad_norm": 0.03697671803596281,
|
|
"learning_rate": 5.28851368978495e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 150406110.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1833
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.757495615940373e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.269534050179211,
|
|
"grad_norm": 0.12998212949409493,
|
|
"learning_rate": 5.278591696527868e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 150486513.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1834
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.272401433691757,
|
|
"grad_norm": 0.07672766562987829,
|
|
"learning_rate": 5.268675681057719e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 150553755.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.275268817204301,
|
|
"grad_norm": 0.111884856419137,
|
|
"learning_rate": 5.258765655929188e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 150637852.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1836
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966859224177393e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.278136200716846,
|
|
"grad_norm": 0.0874979299947862,
|
|
"learning_rate": 5.248861633689391e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 150718790.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1837
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899641578136434e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.28100358422939,
|
|
"grad_norm": 0.1155783267271921,
|
|
"learning_rate": 5.238963626877828e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 150798216.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 1838
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.283870967741936,
|
|
"grad_norm": 0.08954806118932522,
|
|
"learning_rate": 5.229071648026398e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 150883238.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1839
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814615465526806e-09,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.28673835125448,
|
|
"grad_norm": 0.16870842199738778,
|
|
"learning_rate": 5.219185709659354e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 150968520.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.289605734767025,
|
|
"grad_norm": 0.09873181563072708,
|
|
"learning_rate": 5.209305824293307e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151054344.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1841
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.52825480422812e-09,
|
|
"advantages/std": 0.6185514330863953,
|
|
"advantages/var": 0.3826058753732333,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.29247311827957,
|
|
"grad_norm": 0.1421703163138954,
|
|
"learning_rate": 5.199432004437205e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151132679.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1649293452501297,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1842
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380984365940094,
|
|
"advantages/var": 0.05466704299203351,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.295340501792115,
|
|
"grad_norm": 0.05798456668261622,
|
|
"learning_rate": 5.189564262592326e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151200553.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1843
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.2982078853046595,
|
|
"grad_norm": 0.03668176632247106,
|
|
"learning_rate": 5.179702611252231e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151267289.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1844
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.301075268817204,
|
|
"grad_norm": 0.1789702658219247,
|
|
"learning_rate": 5.169847062902784e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151355476.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"advantages/mean": 5.587935447692871e-09,
|
|
"advantages/snr": 1.068867300569461e-08,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.3039426523297495,
|
|
"grad_norm": 0.10527680576301442,
|
|
"learning_rate": 5.159997630022119e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151439451.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1846
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.9792225930357895e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.306810035842294,
|
|
"grad_norm": 0.09277743427521075,
|
|
"learning_rate": 5.150154325080636e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151518552.0,
|
|
"reward": 0.609375,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.609375,
|
|
"rewards/drgrpo_math_reward/std": 0.4898075461387634,
|
|
"step": 1847
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494468808174133,
|
|
"advantages/var": 0.16398020040561878,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 5.309677419354839,
|
|
"grad_norm": 0.10411630799642824,
|
|
"learning_rate": 5.140317160540961e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151603921.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1848
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967079601050182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.312544802867383,
|
|
"grad_norm": 0.11719080496103142,
|
|
"learning_rate": 5.130486148857951e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151678722.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1849
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.672149362599044e-09,
|
|
"advantages/std": 0.5227940678596497,
|
|
"advantages/var": 0.27331363738923997,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.315412186379929,
|
|
"grad_norm": 0.1683067073965903,
|
|
"learning_rate": 5.120661302478677e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 151756456.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1433563083410263,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958683201273463e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.318279569892473,
|
|
"grad_norm": 0.11269277479100119,
|
|
"learning_rate": 5.110842633842405e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151828491.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1851
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.494855892459685e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.321146953405018,
|
|
"grad_norm": 0.10160086799341961,
|
|
"learning_rate": 5.101030155380575e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151905489.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1852
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.324014336917562,
|
|
"grad_norm": 0.08345050577257829,
|
|
"learning_rate": 5.091223879516784e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 151994257.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1853
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.326881720430108,
|
|
"grad_norm": 0.06111620190505675,
|
|
"learning_rate": 5.081423818666787e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152074009.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1854
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.329749103942652,
|
|
"grad_norm": 0.19723566690935937,
|
|
"learning_rate": 5.071629985238473e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 152160116.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975244195968941e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.332616487455197,
|
|
"grad_norm": 0.1322883239193507,
|
|
"learning_rate": 5.061842391631826e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 152243787.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1856
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.335483870967742,
|
|
"grad_norm": 0.0616129656091537,
|
|
"learning_rate": 5.05206105023895e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152316961.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1857
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299811976811062e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.338351254480287,
|
|
"grad_norm": 0.08578285901724589,
|
|
"learning_rate": 5.042285973444027e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152397505.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1858
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917811987622486e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.3412186379928315,
|
|
"grad_norm": 0.10947534256692386,
|
|
"learning_rate": 5.032517173623305e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152466137.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1859
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.344086021505376,
|
|
"grad_norm": 0.08082531946198057,
|
|
"learning_rate": 5.022754663145081e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152551397.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998806953079044e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.3469534050179215,
|
|
"grad_norm": 0.13423104771148225,
|
|
"learning_rate": 5.0129984543697e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152626323.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1861
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.349820788530466,
|
|
"grad_norm": 0.029017165456185587,
|
|
"learning_rate": 5.003248559649525e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152702387.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1862
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875209889720355e-09,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.352688172043011,
|
|
"grad_norm": 0.14866076394896222,
|
|
"learning_rate": 4.993504991328913e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 152772860.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1863
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958683201273463e-10,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.355555555555555,
|
|
"grad_norm": 0.13050036332563442,
|
|
"learning_rate": 4.983767761744229e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152839086.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1864
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.358422939068101,
|
|
"grad_norm": 0.07580217828083215,
|
|
"learning_rate": 4.974036883223798e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 152922350.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.10205792635679245,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.252446745927492e-09,
|
|
"advantages/std": 0.5726904273033142,
|
|
"advantages/var": 0.3279743255248526,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.361290322580645,
|
|
"grad_norm": 0.13115111983883432,
|
|
"learning_rate": 4.964312368087915e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 152999796.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.16781240701675415,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1866
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.2523980013455208e-09,
|
|
"advantages/std": 0.5726990103721619,
|
|
"advantages/var": 0.32798415648125356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.36415770609319,
|
|
"grad_norm": 0.13322988909806766,
|
|
"learning_rate": 4.954594228648806e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 153087722.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.17806214094161987,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1867
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.367025089605734,
|
|
"grad_norm": 0.06590568814798663,
|
|
"learning_rate": 4.944882477210641e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 153159296.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1868
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.962814881146116e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.36989247311828,
|
|
"grad_norm": 0.10051890127853474,
|
|
"learning_rate": 4.935177126069484e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 153240775.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1869
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599521727490371e-09,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.372759856630824,
|
|
"grad_norm": 0.09394611314446655,
|
|
"learning_rate": 4.925478187513312e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 153325009.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.375627240143369,
|
|
"grad_norm": 0.08158773125655303,
|
|
"learning_rate": 4.91578567382197e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 153405905.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1871
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958683201273463e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.378494623655914,
|
|
"grad_norm": 0.11164679408804816,
|
|
"learning_rate": 4.906099597267177e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 153483337.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1872
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.033819584201132e-09,
|
|
"advantages/std": 0.618557333946228,
|
|
"advantages/var": 0.38261317537866546,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.381362007168459,
|
|
"grad_norm": 0.16164954065256135,
|
|
"learning_rate": 4.896419970112499e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 153557165.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.17176413536071777,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1873
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379866977655094e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.3842293906810035,
|
|
"grad_norm": 0.16709996664347315,
|
|
"learning_rate": 4.886746804613332e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 153641555.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1874
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.387096774193548,
|
|
"grad_norm": 0.05553338294295206,
|
|
"learning_rate": 4.877080113016897e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 153708630.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.3899641577060935,
|
|
"grad_norm": 0.08450687755165441,
|
|
"learning_rate": 4.867419907562222e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 153780975.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1876
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899541037727662e-09,
|
|
"advantages/std": 0.40494978427886963,
|
|
"advantages/var": 0.16398432778750305,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.392831541218638,
|
|
"grad_norm": 0.12972421739965487,
|
|
"learning_rate": 4.857766200480115e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 153857618.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1877
|
|
},
|
|
{
|
|
"advantages/mean": 6.05359673500061e-09,
|
|
"advantages/snr": 1.494855892459685e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.395698924731183,
|
|
"grad_norm": 0.13205023635736984,
|
|
"learning_rate": 4.848119003993151e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 153937816.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1878
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.398566308243727,
|
|
"grad_norm": 0.07242213255933978,
|
|
"learning_rate": 4.838478330315677e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 154010870.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1879
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.401433691756273,
|
|
"grad_norm": 0.023759560310194276,
|
|
"learning_rate": 4.828844191653776e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 154085353.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.404301075268817,
|
|
"grad_norm": 0.07006989525161213,
|
|
"learning_rate": 4.819216600205254e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 154166297.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1881
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.407168458781362,
|
|
"grad_norm": 0.08724927934268048,
|
|
"learning_rate": 4.809595568159622e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 154248448.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1882
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 5.410035842293907,
|
|
"grad_norm": 0.03765949863075639,
|
|
"learning_rate": 4.799981107698097e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 154330364.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1883
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 7.317935107621051e-09,
|
|
"advantages/std": 0.5726959109306335,
|
|
"advantages/var": 0.32798060639666815,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 5.412903225806452,
|
|
"grad_norm": 0.1071459123786889,
|
|
"learning_rate": 4.790373230993578e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 154416627.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.17676395177841187,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1884
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.415770609318996,
|
|
"grad_norm": 0.10068983348235683,
|
|
"learning_rate": 4.780771950210615e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 154498074.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0689055673126508e-08,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.418637992831541,
|
|
"grad_norm": 0.11209554967802351,
|
|
"learning_rate": 4.771177277505412e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 154583254.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1886
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.421505376344086,
|
|
"grad_norm": 0.14202464929928846,
|
|
"learning_rate": 4.761589225025811e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 154665590.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1887
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.424372759856631,
|
|
"grad_norm": 0.1849833455302111,
|
|
"learning_rate": 4.7520078049112764e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 154742173.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 1888
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.4272401433691755,
|
|
"grad_norm": 0.09493768489189344,
|
|
"learning_rate": 4.742433029292855e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 154824204.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1889
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.43010752688172,
|
|
"grad_norm": 0.10117158992485455,
|
|
"learning_rate": 4.7328649102932005e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 154916159.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.4329749103942655,
|
|
"grad_norm": 0.11939070460326162,
|
|
"learning_rate": 4.7233034600265373e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 154990673.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1891
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.505166341645741e-09,
|
|
"advantages/std": 0.5726664066314697,
|
|
"advantages/var": 0.32794681328419983,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.43584229390681,
|
|
"grad_norm": 0.23549992551626983,
|
|
"learning_rate": 4.713748690598637e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155062263.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.13941732048988342,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1892
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917669804463866e-09,
|
|
"advantages/std": 0.46758610010147095,
|
|
"advantages/var": 0.2186367610081028,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.438709677419355,
|
|
"grad_norm": 0.10053341096553804,
|
|
"learning_rate": 4.7042006141068123e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155139301.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1893
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975041977944071e-09,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.4415770609319,
|
|
"grad_norm": 0.10090695810415312,
|
|
"learning_rate": 4.6946592426399134e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 155218436.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1894
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814647963303764e-09,
|
|
"advantages/std": 0.5227847099304199,
|
|
"advantages/var": 0.2733038529370333,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.444444444444445,
|
|
"grad_norm": 0.23038553513838003,
|
|
"learning_rate": 4.685124588278296e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 155297595.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1331065595149994,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"advantages/mean": -7.916241884231567e-09,
|
|
"advantages/snr": 1.6929196050355823e-08,
|
|
"advantages/std": 0.46760883927345276,
|
|
"advantages/var": 0.21865802656666578,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.447311827956989,
|
|
"grad_norm": 0.0979355768194412,
|
|
"learning_rate": 4.6755966630938084e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155376489.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.11914245784282684,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1896
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.450179211469534,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 4.6660754791497745e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155445844.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 1.0,
|
|
"rewards/drgrpo_math_reward/std": 0.0,
|
|
"step": 1897
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.453046594982079,
|
|
"grad_norm": 0.0725629879509453,
|
|
"learning_rate": 4.6565610485009953e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155526491.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1898
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.4251692372421445e-08,
|
|
"advantages/std": 0.5227856636047363,
|
|
"advantages/var": 0.27330485007064453,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.455913978494624,
|
|
"grad_norm": 0.12173701527788272,
|
|
"learning_rate": 4.6470533831937167e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155604914.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1899
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629234993268783e-09,
|
|
"advantages/std": 0.5227856040000916,
|
|
"advantages/var": 0.27330478774974054,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.458781362007168,
|
|
"grad_norm": 0.11586136349616745,
|
|
"learning_rate": 4.637552495265616e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155683803.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.13098981976509094,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.461648745519713,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 4.628058396745786e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155753711.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1901
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979214976743015e-09,
|
|
"advantages/std": 0.4676051139831543,
|
|
"advantages/var": 0.21865454262319872,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.464516129032258,
|
|
"grad_norm": 0.1331306280504158,
|
|
"learning_rate": 4.6185710996547343e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 155832649.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.11784426867961884,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 1902
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.467383512544803,
|
|
"grad_norm": 0.09456138483613138,
|
|
"learning_rate": 4.609090616004354e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155908172.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1903
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958278899535023e-10,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.4702508960573475,
|
|
"grad_norm": 0.09422582550936447,
|
|
"learning_rate": 4.599616957797903e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 155987999.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1236182376742363,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1904
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149940263022739e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.473118279569892,
|
|
"grad_norm": 0.10386660888721715,
|
|
"learning_rate": 4.590150137030009e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 156085665.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5628065113579712e-09,
|
|
"advantages/std": 0.5228027701377869,
|
|
"advantages/var": 0.2733227364637436,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.4759856630824375,
|
|
"grad_norm": 0.16402560566274624,
|
|
"learning_rate": 4.5806901656866357e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 156165779.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.15254521369934082,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1906
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721801332404694e-09,
|
|
"advantages/std": 0.5227880477905273,
|
|
"advantages/var": 0.2733073429126307,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.478853046594982,
|
|
"grad_norm": 0.11023671658190864,
|
|
"learning_rate": 4.571237055745073e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 156251407.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1907
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.481720430107527,
|
|
"grad_norm": 0.06128699635907892,
|
|
"learning_rate": 4.5617908191739296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 156320924.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1908
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.484587813620072,
|
|
"grad_norm": 0.04613452362847414,
|
|
"learning_rate": 4.5523514679331143e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 156400228.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 1909
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967258348850795e-09,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.487455197132617,
|
|
"grad_norm": 0.07612845603329545,
|
|
"learning_rate": 4.5429190139738084e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 156477372.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.490322580645161,
|
|
"grad_norm": 0.058345033045249815,
|
|
"learning_rate": 4.533493469238464e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 156560690.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1911
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.493189964157706,
|
|
"grad_norm": 0.06742502881837097,
|
|
"learning_rate": 4.524074845660788e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 156642607.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1912
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.496057347670251,
|
|
"grad_norm": 0.06443206846731298,
|
|
"learning_rate": 4.51466315516573e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 156731575.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1913
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.498924731182796,
|
|
"grad_norm": 0.09620239765164847,
|
|
"learning_rate": 4.505258409669449e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 156818167.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 1914
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.50179211469534,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 4.495860621079315e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 156895505.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.1498668924479387e-09,
|
|
"advantages/std": 0.404969722032547,
|
|
"advantages/var": 0.16400047576311838,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.504659498207886,
|
|
"grad_norm": 0.11654146819249533,
|
|
"learning_rate": 4.486469801293893e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 156973942.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.09916213154792786,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1916
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.50752688172043,
|
|
"grad_norm": 0.0464231644339536,
|
|
"learning_rate": 4.47708596220293e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157046572.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1917
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.510394265232975,
|
|
"grad_norm": 0.11270588447924891,
|
|
"learning_rate": 4.467709115687324e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157111547.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 1918
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.97501037071382e-09,
|
|
"advantages/std": 0.46760883927345276,
|
|
"advantages/var": 0.21865802656666578,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.5132616487455195,
|
|
"grad_norm": 0.11110330877431249,
|
|
"learning_rate": 4.4583392736191184e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 157197714.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.11914245784282684,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1919
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.7495735248314805e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.516129032258064,
|
|
"grad_norm": 0.09720678227725901,
|
|
"learning_rate": 4.448976447861499e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157285191.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"advantages/mean": 7.916241884231567e-09,
|
|
"advantages/snr": 1.6929356816321683e-08,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.5189964157706095,
|
|
"grad_norm": 0.14978010239675016,
|
|
"learning_rate": 4.4396206502687703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157363001.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1921
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.521863799283154,
|
|
"grad_norm": 0.08382491164784144,
|
|
"learning_rate": 4.430271892686317e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157440871.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1922
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.524731182795699,
|
|
"grad_norm": 0.07748904121356069,
|
|
"learning_rate": 4.420930186950631e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 157515620.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1923
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967124795048994e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.527598566308244,
|
|
"grad_norm": 0.1088886868490235,
|
|
"learning_rate": 4.4115955448892725e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157588147.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1924
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 5.530465949820789,
|
|
"grad_norm": 0.06511628825323257,
|
|
"learning_rate": 4.402267978320854e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 157676182.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.533333333333333,
|
|
"grad_norm": 0.10998579681902464,
|
|
"learning_rate": 4.392947499055024e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157746767.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1926
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.5055872703345218e-09,
|
|
"advantages/std": 0.6185775995254517,
|
|
"advantages/var": 0.38263824663467005,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.536200716845878,
|
|
"grad_norm": 0.1398529103422478,
|
|
"learning_rate": 4.383634118892472e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157838953.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.19674429297447205,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1927
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.539068100358423,
|
|
"grad_norm": 0.0903178064579871,
|
|
"learning_rate": 4.3743278496248926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157907395.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1928
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599512249801046e-09,
|
|
"advantages/std": 0.4049657881259918,
|
|
"advantages/var": 0.1639972895525057,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.541935483870968,
|
|
"grad_norm": 0.0894662363886373,
|
|
"learning_rate": 4.365028703034975e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 157981381.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.094686359167099,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1929
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.544802867383512,
|
|
"grad_norm": 0.11168755722385075,
|
|
"learning_rate": 4.355736690896389e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 158062896.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 5.547670250896058,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 4.3464518249737757e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 158146371.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1931
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.550537634408602,
|
|
"grad_norm": 0.06435845077798222,
|
|
"learning_rate": 4.337174117022733e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 158242393.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1932
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.0539321109852954e-08,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.553405017921147,
|
|
"grad_norm": 0.13040252276106876,
|
|
"learning_rate": 4.3279035787897845e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 158336544.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.1820138692855835,
|
|
"rewards/drgrpo_math_reward/mean": 0.59375,
|
|
"rewards/drgrpo_math_reward/std": 0.4930621087551117,
|
|
"step": 1933
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633261853378446e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.5562724014336915,
|
|
"grad_norm": 0.03814541076256376,
|
|
"learning_rate": 4.3186402220123807e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 158422133.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1934
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.559139784946236,
|
|
"grad_norm": 0.12475499796193422,
|
|
"learning_rate": 4.3093840584188834e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 158495357.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.5620071684587815,
|
|
"grad_norm": 0.11435696114547454,
|
|
"learning_rate": 4.300135099728549e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 158571381.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1936
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.564874551971326,
|
|
"grad_norm": 0.14605670522014397,
|
|
"learning_rate": 4.290893357651502e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 158644594.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1937
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.567741935483871,
|
|
"grad_norm": 1.6022086039464418,
|
|
"learning_rate": 4.2816588438887336e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 158717798.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1938
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344527836563254e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.570609318996416,
|
|
"grad_norm": 0.1015551597876325,
|
|
"learning_rate": 4.2724315701320913e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 158799021.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 1939
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.573476702508961,
|
|
"grad_norm": 0.05159154932555391,
|
|
"learning_rate": 4.2632115480642415e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 158874489.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907335152797101e-10,
|
|
"advantages/std": 0.5227840542793274,
|
|
"advantages/var": 0.27330316740873073,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.576344086021505,
|
|
"grad_norm": 0.11100900508926137,
|
|
"learning_rate": 4.2539987893586825e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 158965012.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.13204574584960938,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1941
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.4251629002153753e-08,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.57921146953405,
|
|
"grad_norm": 0.10140498510511371,
|
|
"learning_rate": 4.244793305679715e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 159049841.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1942
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0689055673126508e-08,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.582078853046595,
|
|
"grad_norm": 0.1923077422965147,
|
|
"learning_rate": 4.2355951086824195e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 159119943.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.1173202320933342,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1943
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.8788204215685765e-09,
|
|
"advantages/std": 0.5726727843284607,
|
|
"advantages/var": 0.32795411791051166,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.58494623655914,
|
|
"grad_norm": 0.11581234230628372,
|
|
"learning_rate": 4.226404210012654e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 159202580.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1462520956993103,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 1944
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.587813620071684,
|
|
"grad_norm": 0.05502996160027891,
|
|
"learning_rate": 4.217220621307043e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 159282107.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227847099304199,
|
|
"advantages/var": 0.2733038529370333,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.59068100358423,
|
|
"grad_norm": 0.1209754534041952,
|
|
"learning_rate": 4.2080443541929534e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 159375869.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.13310657441616058,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1946
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.593548387096774,
|
|
"grad_norm": 0.10984024628788239,
|
|
"learning_rate": 4.198875420288477e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 159453028.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1947
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.596415770609319,
|
|
"grad_norm": 0.11507229836869538,
|
|
"learning_rate": 4.189713831202419e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 159535671.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1948
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.5992831541218635,
|
|
"grad_norm": 0.1137030854495678,
|
|
"learning_rate": 4.1805595985342967e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 159620953.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 1949
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907227504745508e-10,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.602150537634409,
|
|
"grad_norm": 0.0914234940800283,
|
|
"learning_rate": 4.1714127338743086e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 159697331.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 5.6050179211469535,
|
|
"grad_norm": 0.14375288366487707,
|
|
"learning_rate": 4.162273248803322e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 159789361.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1951
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.607885304659498,
|
|
"grad_norm": 0.07067283524310007,
|
|
"learning_rate": 4.1531411548928554e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 159854093.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 1952
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.781453422160313e-09,
|
|
"advantages/std": 0.5227880477905273,
|
|
"advantages/var": 0.2733073429126307,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.610752688172043,
|
|
"grad_norm": 0.10935663458432411,
|
|
"learning_rate": 4.14401646370508e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 159945699.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1953
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299807237755752e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 5.613620071684588,
|
|
"grad_norm": 0.1492686257219773,
|
|
"learning_rate": 4.1348991867927987e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160038116.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1954
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125814501076877e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.616487455197133,
|
|
"grad_norm": 0.2273844241640259,
|
|
"learning_rate": 4.1257893356994036e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160133466.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.619354838709677,
|
|
"grad_norm": 0.089889769658418,
|
|
"learning_rate": 4.116686921958907e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 160213627.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1956
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.7495735248314805e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.622222222222222,
|
|
"grad_norm": 0.11244164019978824,
|
|
"learning_rate": 4.1075919570959026e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160289304.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1957
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2945998544314845e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.625089605734767,
|
|
"grad_norm": 0.09884065280090722,
|
|
"learning_rate": 4.098504452625544e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160368798.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1958
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562855255411359e-09,
|
|
"advantages/std": 0.5227956175804138,
|
|
"advantages/var": 0.2733152577612863,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.627956989247312,
|
|
"grad_norm": 0.12041629172262175,
|
|
"learning_rate": 4.08942442005354e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 160457466.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.14230038225650787,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1959
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.630824372759856,
|
|
"grad_norm": 0.11398688046581287,
|
|
"learning_rate": 4.0803518708761455e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160543054.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"advantages/mean": 8.847564458847046e-09,
|
|
"advantages/snr": 1.692320165841484e-08,
|
|
"advantages/std": 0.5228067636489868,
|
|
"advantages/var": 0.27332691211712756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.633691756272402,
|
|
"grad_norm": 0.08094169219006273,
|
|
"learning_rate": 4.0712868165801416e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 160636921.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.15490421652793884,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 1961
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.636559139784946,
|
|
"grad_norm": 0.10298150500513412,
|
|
"learning_rate": 4.0622292686428136e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160710063.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1962
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.639426523297491,
|
|
"grad_norm": 0.10455267483184219,
|
|
"learning_rate": 4.053179238531943e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160782092.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 1963
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.6422939068100355,
|
|
"grad_norm": 0.08218972282732463,
|
|
"learning_rate": 4.044136737705797e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 160848771.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1964
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262250655110813e-09,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.645161290322581,
|
|
"grad_norm": 0.13816684884236538,
|
|
"learning_rate": 4.0351017776131125e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160925428.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.16675157845020294,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.6480286738351255,
|
|
"grad_norm": 0.12260067352969628,
|
|
"learning_rate": 4.0260743696930733e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 160998889.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 1966
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.65089605734767,
|
|
"grad_norm": 0.04081103779018167,
|
|
"learning_rate": 4.0170545253752984e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 161071658.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1967
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.8994882297977766e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.6537634408602155,
|
|
"grad_norm": 0.08007249811362589,
|
|
"learning_rate": 4.00804225607984e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 161147839.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1968
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.449710856633628e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.65663082437276,
|
|
"grad_norm": 0.07698763572693008,
|
|
"learning_rate": 3.9990375732171566e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 161230330.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1969
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.659498207885305,
|
|
"grad_norm": 0.08224400664250893,
|
|
"learning_rate": 3.990040488188099e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 161317384.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 9.786419100711427e-09,
|
|
"advantages/std": 0.6185711622238159,
|
|
"advantages/var": 0.3826302827349224,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.662365591397849,
|
|
"grad_norm": 0.14909888159282425,
|
|
"learning_rate": 3.9810510123838924e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 161397346.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.18884867429733276,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1971
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950488391937882e-08,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.665232974910394,
|
|
"grad_norm": 0.13392262012664546,
|
|
"learning_rate": 3.972069157186144e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 161476175.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1972
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.668100358422939,
|
|
"grad_norm": 0.12102725236303907,
|
|
"learning_rate": 3.963094933966796e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 161550833.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1973
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016504754270957e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.670967741935484,
|
|
"grad_norm": 0.14141676697189273,
|
|
"learning_rate": 3.954128354088142e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 161634846.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 1974
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.673835125448028,
|
|
"grad_norm": 0.05737776158606781,
|
|
"learning_rate": 3.9451694289027836e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 161725619.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.676702508960574,
|
|
"grad_norm": 0.09190431122668229,
|
|
"learning_rate": 3.9362181697536466e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 161795519.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 1976
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.679569892473118,
|
|
"grad_norm": 0.0656476789024684,
|
|
"learning_rate": 3.927274587973934e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 161867133.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 1977
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.682437275985663,
|
|
"grad_norm": 0.09649804638539027,
|
|
"learning_rate": 3.9183386948871465e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 161954954.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 1978
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.6853046594982075,
|
|
"grad_norm": 0.1569984779356773,
|
|
"learning_rate": 3.9094105018070323e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162035986.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 1979
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.688172043010753,
|
|
"grad_norm": 0.043728538486143304,
|
|
"learning_rate": 3.900490020037607e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162109285.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.6910394265232975,
|
|
"grad_norm": 0.08725815249044895,
|
|
"learning_rate": 3.8915772608731055e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 162182941.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1981
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.693906810035842,
|
|
"grad_norm": 0.09439298982117392,
|
|
"learning_rate": 3.882672235598002e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162249765.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 1982
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.6967741935483875,
|
|
"grad_norm": 0.11582483962707424,
|
|
"learning_rate": 3.8737749554869723e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162320985.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 1983
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.699641577060932,
|
|
"grad_norm": 0.07012001279124831,
|
|
"learning_rate": 3.864885431804882e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162413913.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 1984
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899641578136434e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.702508960573477,
|
|
"grad_norm": 0.07596301689513589,
|
|
"learning_rate": 3.856003675806776e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162497532.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475770542322e-09,
|
|
"advantages/std": 0.3306560218334198,
|
|
"advantages/var": 0.10933340477470299,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.705376344086021,
|
|
"grad_norm": 0.07054761515917433,
|
|
"learning_rate": 3.847129698737872e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162576663.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 1986
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975310081633805e-09,
|
|
"advantages/std": 0.46758538484573364,
|
|
"advantages/var": 0.21863609212133284,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.708243727598567,
|
|
"grad_norm": 0.09447728328744381,
|
|
"learning_rate": 3.8382635118335417e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162653726.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1987
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721682514236524e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.711111111111111,
|
|
"grad_norm": 0.1489684627009545,
|
|
"learning_rate": 3.8294051263192715e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162729368.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 1988
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.34425851086805e-09,
|
|
"advantages/std": 0.5227980017662048,
|
|
"advantages/var": 0.2733177506507367,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.713978494623656,
|
|
"grad_norm": 0.13829378872343365,
|
|
"learning_rate": 3.820554553410693e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 162801784.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.14571532607078552,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 1989
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380984365940094,
|
|
"advantages/var": 0.05466704299203351,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.7168458781362,
|
|
"grad_norm": 0.038714326891214505,
|
|
"learning_rate": 3.8117118043135434e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 162877889.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.719713261648746,
|
|
"grad_norm": 0.08081431191174297,
|
|
"learning_rate": 3.8028768902236454e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 162945368.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 1991
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.72258064516129,
|
|
"grad_norm": 0.05293243597736124,
|
|
"learning_rate": 3.794049822326901e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 163032316.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 1992
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.725448028673835,
|
|
"grad_norm": 0.07045376100312263,
|
|
"learning_rate": 3.785230611799289e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163112655.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1993
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2524951534513563e-09,
|
|
"advantages/std": 0.5726819038391113,
|
|
"advantages/var": 0.32796456298478915,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.7283154121863795,
|
|
"grad_norm": 0.13659353188995949,
|
|
"learning_rate": 3.7764192698068367e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 163191994.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.15756267309188843,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 1994
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983406762715241e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.731182795698925,
|
|
"grad_norm": 0.11106040148982503,
|
|
"learning_rate": 3.767615807505602e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163276687.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.7340501792114695,
|
|
"grad_norm": 0.0737863864678126,
|
|
"learning_rate": 3.7588202360416677e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 163349138.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 1996
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499147049662961e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.736917562724014,
|
|
"grad_norm": 0.10175601906137141,
|
|
"learning_rate": 3.7500325665511335e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163426094.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 1997
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629072505384383e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.7397849462365595,
|
|
"grad_norm": 0.16061601080350418,
|
|
"learning_rate": 3.7412528101600914e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163506487.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 1998
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.742652329749104,
|
|
"grad_norm": 0.048944587375708226,
|
|
"learning_rate": 3.7324809779846113e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163598578.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 1999
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954734451444e-08,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.745519713261649,
|
|
"grad_norm": 0.17096242893421115,
|
|
"learning_rate": 3.723717081130727e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163677199.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.748387096774193,
|
|
"grad_norm": 0.07559078252278754,
|
|
"learning_rate": 3.714961130694435e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163754801.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2001
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.751254480286739,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 3.706213137761669e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163818241.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2002
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.754121863799283,
|
|
"grad_norm": 0.09539685136903454,
|
|
"learning_rate": 3.6974731134082814e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 163887406.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2003
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299802498719973e-09,
|
|
"advantages/std": 0.4049576222896576,
|
|
"advantages/var": 0.16399067585049298,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.756989247311828,
|
|
"grad_norm": 0.10087787426090282,
|
|
"learning_rate": 3.6887410687000365e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 163973057.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.08891239762306213,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2004
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.759856630824372,
|
|
"grad_norm": 0.07658307484031797,
|
|
"learning_rate": 3.6800170146926037e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 164053646.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.762724014336918,
|
|
"grad_norm": 0.08196801596565012,
|
|
"learning_rate": 3.671300962431524e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164138932.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2006
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.765591397849462,
|
|
"grad_norm": 0.11316095264189938,
|
|
"learning_rate": 3.6625929229522176e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 164209748.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2007
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875832530345343e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.768458781362007,
|
|
"grad_norm": 0.23771799442133026,
|
|
"learning_rate": 3.6538929072799516e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164290862.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.10994865745306015,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2008
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.7713261648745515,
|
|
"grad_norm": 0.07282643843336861,
|
|
"learning_rate": 3.6452009264298435e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164364778.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2009
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975300560494157e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.774193548387097,
|
|
"grad_norm": 0.11096671723426844,
|
|
"learning_rate": 3.636516991406824e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164452412.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 7.31793053783837e-09,
|
|
"advantages/std": 0.5726962685585022,
|
|
"advantages/var": 0.3279810160208321,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.7770609318996415,
|
|
"grad_norm": 0.1953736314427171,
|
|
"learning_rate": 3.627841113205652e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164537183.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.17358636856079102,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2011
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.979139449767511e-09,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.779928315412186,
|
|
"grad_norm": 0.08832565021728052,
|
|
"learning_rate": 3.619173302810874e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 164617332.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1236182376742363,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2012
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.7827956989247316,
|
|
"grad_norm": 0.04795474434418676,
|
|
"learning_rate": 3.6105135711968313e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164675747.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2013
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.785663082437276,
|
|
"grad_norm": 0.0762170456763652,
|
|
"learning_rate": 3.6018619293276253e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164750052.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2014
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227833986282349,
|
|
"advantages/var": 0.2733024818812879,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.788530465949821,
|
|
"grad_norm": 0.10339891260122874,
|
|
"learning_rate": 3.5932183881571297e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164830891.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.13098490238189697,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.131336901697577e-10,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.791397849462365,
|
|
"grad_norm": 0.15732195307573668,
|
|
"learning_rate": 3.5845829586289454e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 164908518.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2016
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.794265232974911,
|
|
"grad_norm": 0.048387986175058725,
|
|
"learning_rate": 3.57595565167642e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165000089.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2017
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.987588013390756e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.797132616487455,
|
|
"grad_norm": 0.10363135854352874,
|
|
"learning_rate": 3.5673364782226e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165069221.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2018
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.8,
|
|
"grad_norm": 0.08947989574715134,
|
|
"learning_rate": 3.5587254491802467e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165152527.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2019
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562971027883829e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.802867383512545,
|
|
"grad_norm": 0.12634591947892704,
|
|
"learning_rate": 3.5501225754518114e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165226762.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.80573476702509,
|
|
"grad_norm": 0.08052528615783176,
|
|
"learning_rate": 3.5415278679294023e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165295973.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2021
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.808602150537634,
|
|
"grad_norm": 0.04711272795780744,
|
|
"learning_rate": 3.532941337494806e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 165370611.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2022
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.811469534050179,
|
|
"grad_norm": 0.10591775369957132,
|
|
"learning_rate": 3.5243629950194544e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165446015.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 2023
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.5050695213580615e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.8143369175627235,
|
|
"grad_norm": 0.11084765542165069,
|
|
"learning_rate": 3.515792851364403e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165538784.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2024
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.817204301075269,
|
|
"grad_norm": 0.10179440611855767,
|
|
"learning_rate": 3.5072309173803314e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165610071.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 5.8200716845878135,
|
|
"grad_norm": 0.04839040639140518,
|
|
"learning_rate": 3.4986772039075285e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165692835.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2026
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.822939068100358,
|
|
"grad_norm": 0.11680509758622568,
|
|
"learning_rate": 3.4901317217758765e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165777148.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2027
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 5.825806451612904,
|
|
"grad_norm": 0.07852625663584593,
|
|
"learning_rate": 3.481594481804826e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165859661.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2028
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.816724861393605e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.828673835125448,
|
|
"grad_norm": 0.06671980747464423,
|
|
"learning_rate": 3.4730654948033955e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 165939101.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2029
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975300560494157e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.831541218637993,
|
|
"grad_norm": 0.1531783422723006,
|
|
"learning_rate": 3.4645447715701627e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 166010891.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.834408602150537,
|
|
"grad_norm": 0.08238093946583412,
|
|
"learning_rate": 3.4560323228932363e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 166087279.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2031
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.837275985663083,
|
|
"grad_norm": 0.0572539307371559,
|
|
"learning_rate": 3.4475281595502494e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 166162887.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2032
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.4393256658538594e-09,
|
|
"advantages/std": 0.5726926326751709,
|
|
"advantages/var": 0.3279768515204182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.840143369175627,
|
|
"grad_norm": 0.11843726952325846,
|
|
"learning_rate": 3.439032292308338e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 166233293.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.1712273508310318,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2033
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.6262294661506412e-09,
|
|
"advantages/std": 0.5726882815361023,
|
|
"advantages/var": 0.32797186780877396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.843010752688172,
|
|
"grad_norm": 0.15940737791118242,
|
|
"learning_rate": 3.4305447319241467e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 166312554.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1643974483013153,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2034
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.845878136200717,
|
|
"grad_norm": 0.16940003745396273,
|
|
"learning_rate": 3.422065489143798e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 166384633.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.848745519713262,
|
|
"grad_norm": 0.0826429917536307,
|
|
"learning_rate": 3.413594574702882e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 166452558.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2036
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.851612903225806,
|
|
"grad_norm": 0.05171982690009139,
|
|
"learning_rate": 3.405131999326439e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 166529050.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2037
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0688864335985631e-08,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.854480286738351,
|
|
"grad_norm": 0.14120005876554986,
|
|
"learning_rate": 3.396677773728966e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 166614523.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2038
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.857347670250896,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 3.3882319086143705e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 166689571.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2039
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975044643629382e-09,
|
|
"advantages/std": 0.46760615706443787,
|
|
"advantages/var": 0.21865551812457173,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.860215053763441,
|
|
"grad_norm": 0.08531543441631127,
|
|
"learning_rate": 3.3797944146759914e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 166777193.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 5.8630824372759855,
|
|
"grad_norm": 0.06246146558688164,
|
|
"learning_rate": 3.371365302596554e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 166869982.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2041
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.86594982078853,
|
|
"grad_norm": 0.08762273111327888,
|
|
"learning_rate": 3.362944583048184e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 166951809.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2042
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.868817204301076,
|
|
"grad_norm": 0.08145679479089268,
|
|
"learning_rate": 3.3545322666923714e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 167028854.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2043
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.87168458781362,
|
|
"grad_norm": 0.09394169673263605,
|
|
"learning_rate": 3.3461283641799755e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 167107604.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2044
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.125591088259532e-09,
|
|
"advantages/std": 0.5228043794631958,
|
|
"advantages/var": 0.2733244191858972,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.874551971326165,
|
|
"grad_norm": 0.09850840806702302,
|
|
"learning_rate": 3.337732886151192e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 167180323.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1514892876148224,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.449710856633628e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.877419354838709,
|
|
"grad_norm": 0.13922270689328578,
|
|
"learning_rate": 3.329345843235565e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 167270361.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2046
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.880286738351255,
|
|
"grad_norm": 0.06267864593183546,
|
|
"learning_rate": 3.3209672460519423e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 167354979.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2047
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975076251222236e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.883154121863799,
|
|
"grad_norm": 0.12744459211179202,
|
|
"learning_rate": 3.312597105208494e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 167433194.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2048
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.966876488284889e-09,
|
|
"advantages/std": 0.4675973355770111,
|
|
"advantages/var": 0.21864726823871994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 5.886021505376344,
|
|
"grad_norm": 0.09306765835885951,
|
|
"learning_rate": 3.30423543130267e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 167528659.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.11100948601961136,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 2049
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.888888888888889,
|
|
"grad_norm": 0.07762667693115997,
|
|
"learning_rate": 3.2958822349212137e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 167604439.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.891756272401434,
|
|
"grad_norm": 0.0957554588223651,
|
|
"learning_rate": 3.287537526640121e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 167686517.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2051
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.894623655913978,
|
|
"grad_norm": 0.0674792368334321,
|
|
"learning_rate": 3.279201317024654e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 167771336.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2052
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.974967339721013e-09,
|
|
"advantages/std": 0.46761220693588257,
|
|
"advantages/var": 0.21866117607544666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 5.897491039426523,
|
|
"grad_norm": 0.11858480700480502,
|
|
"learning_rate": 3.270873616629306e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 167858183.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1236182376742363,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2053
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.900358422939068,
|
|
"grad_norm": 0.04302305827166325,
|
|
"learning_rate": 3.2625544359977963e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 167929560.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2054
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.903225806451613,
|
|
"grad_norm": 0.04371432834433029,
|
|
"learning_rate": 3.2542437856630644e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168000634.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9916920837407456e-09,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.9060931899641576,
|
|
"grad_norm": 0.16814645143740356,
|
|
"learning_rate": 3.245941676147247e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 168090091.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 2056
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.9792225930357895e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.908960573476703,
|
|
"grad_norm": 0.11896140281355601,
|
|
"learning_rate": 3.237648117961664e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168160425.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.11678344756364822,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2057
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.911827956989248,
|
|
"grad_norm": 0.046553395878668816,
|
|
"learning_rate": 3.2293631216068064e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168230872.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2058
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 5.914695340501792,
|
|
"grad_norm": 0.09596519244867824,
|
|
"learning_rate": 3.2210866975723327e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 168314832.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2059
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.917562724014337,
|
|
"grad_norm": 0.09012923952890138,
|
|
"learning_rate": 3.212818856337047e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 168400345.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 5.920430107526881,
|
|
"grad_norm": 0.2208097696376133,
|
|
"learning_rate": 3.204559608368881e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168485388.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1462520956993103,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2061
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 5.923297491039427,
|
|
"grad_norm": 0.07525616983142458,
|
|
"learning_rate": 3.196308964124885e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168575235.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2062
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.926164874551971,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 3.1880669340512257e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168648678.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2063
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.929032258064516,
|
|
"grad_norm": 0.037728568946204376,
|
|
"learning_rate": 3.1798335285831604e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 168724461.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2064
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.931899641577061,
|
|
"grad_norm": 0.11745170919584264,
|
|
"learning_rate": 3.171608758145019e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168808795.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.934767025089606,
|
|
"grad_norm": 0.09771017719154974,
|
|
"learning_rate": 3.1633926331502046e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168887424.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2066
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96693437436781e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.93763440860215,
|
|
"grad_norm": 0.0842407973449496,
|
|
"learning_rate": 3.1551851640011753e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 168961848.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2067
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.126037115417672e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.940501792114695,
|
|
"grad_norm": 0.17016598218151802,
|
|
"learning_rate": 3.14698636108943e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 169036398.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2068
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.94336917562724,
|
|
"grad_norm": 0.09523117928043917,
|
|
"learning_rate": 3.138796234795493e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 169127303.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2069
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.946236559139785,
|
|
"grad_norm": 0.08716642063708144,
|
|
"learning_rate": 3.1306147954888994e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 169216290.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 5.94910394265233,
|
|
"grad_norm": 0.01805907462468101,
|
|
"learning_rate": 3.122442053528197e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 169290507.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2071
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262736431211962e-09,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 5.951971326164875,
|
|
"grad_norm": 0.2207721407920864,
|
|
"learning_rate": 3.1142780192609087e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 169374304.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1462521106004715,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2072
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.95483870967742,
|
|
"grad_norm": 0.13343166992067354,
|
|
"learning_rate": 3.1061227030235437e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 169443130.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2073
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.757858184220564e-09,
|
|
"advantages/std": 0.5726600289344788,
|
|
"advantages/var": 0.32793950873923805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.957706093189964,
|
|
"grad_norm": 0.1864190190940253,
|
|
"learning_rate": 3.097976115141564e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 169523072.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.13258251547813416,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2074
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.960573476702509,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 3.0898382659293896e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 169593138.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 1.0,
|
|
"rewards/drgrpo_math_reward/std": 0.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349153895649778e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.963440860215054,
|
|
"grad_norm": 0.11777017743153291,
|
|
"learning_rate": 3.081709165690367e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 169675624.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2076
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.9707659080050774e-09,
|
|
"advantages/std": 0.46761417388916016,
|
|
"advantages/var": 0.2186630156220417,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.966308243727599,
|
|
"grad_norm": 0.12218652579101849,
|
|
"learning_rate": 3.0735888247167764e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 169747260.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.12597234547138214,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2077
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.969175627240143,
|
|
"grad_norm": 0.08941130491646689,
|
|
"learning_rate": 3.0654772532897945e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 169830051.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2078
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 5.972043010752688,
|
|
"grad_norm": 0.07562506176953834,
|
|
"learning_rate": 3.0573744616795095e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 169914664.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2079
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899627360122966e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.974910394265233,
|
|
"grad_norm": 0.0868540417840124,
|
|
"learning_rate": 3.0492804601448805e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 170005076.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.977777777777778,
|
|
"grad_norm": 0.11204298312949046,
|
|
"learning_rate": 3.041195258933749e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170084455.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2081
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.980645161290322,
|
|
"grad_norm": 0.10929001150835986,
|
|
"learning_rate": 3.033118868282802e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170158408.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2082
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.983512544802867,
|
|
"grad_norm": 0.17984310267930984,
|
|
"learning_rate": 3.0250512984175846e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 170236538.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2083
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 5.986379928315412,
|
|
"grad_norm": 0.05927892985218351,
|
|
"learning_rate": 3.01699255955246e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170315219.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2084
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.989247311827957,
|
|
"grad_norm": 0.06299542985097878,
|
|
"learning_rate": 3.008942661890627e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170394850.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.394200364231044e-08,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 5.992114695340502,
|
|
"grad_norm": 0.1320863635374229,
|
|
"learning_rate": 3.000901615624075e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170473179.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2086
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 5.994982078853047,
|
|
"grad_norm": 0.08514137587578359,
|
|
"learning_rate": 2.9928694309335913e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170538199.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2087
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.2470398597593402e-08,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 5.997849462365592,
|
|
"grad_norm": 0.12653525987136202,
|
|
"learning_rate": 2.9848461179887474e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170617091.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2088
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878705662607237e-09,
|
|
"advantages/std": 0.5726862549781799,
|
|
"advantages/var": 0.3279695466409329,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.002867383512545,
|
|
"grad_norm": 0.1526324127718748,
|
|
"learning_rate": 2.9768316869478836e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170694797.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.16439256072044373,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2089
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.878773704879678e-09,
|
|
"advantages/std": 0.57267826795578,
|
|
"advantages/var": 0.3279603985888322,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.00573476702509,
|
|
"grad_norm": 0.1224702558266982,
|
|
"learning_rate": 2.968826147958088e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170780478.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1552036553621292,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.1266523706756892e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.008602150537635,
|
|
"grad_norm": 0.16566949778844922,
|
|
"learning_rate": 2.9608295111551904e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170851318.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2091
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9753001796492024e-09,
|
|
"advantages/std": 0.4675861597061157,
|
|
"advantages/var": 0.21863681674871316,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.011469534050179,
|
|
"grad_norm": 0.088479984386407,
|
|
"learning_rate": 2.952841786663757e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 170935472.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2092
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 6.014336917562724,
|
|
"grad_norm": 0.10912541991364041,
|
|
"learning_rate": 2.9448629845970675e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171018216.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2093
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149940263022739e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.017204301075269,
|
|
"grad_norm": 0.0651784278670309,
|
|
"learning_rate": 2.936893115057101e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171101556.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2094
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.020071684587814,
|
|
"grad_norm": 0.08269475298043079,
|
|
"learning_rate": 2.9289321881345254e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171180424.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 6.022939068100358,
|
|
"grad_norm": 0.06010288134659172,
|
|
"learning_rate": 2.920980213908695e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171262183.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2096
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.025806451612903,
|
|
"grad_norm": 0.06993152989619203,
|
|
"learning_rate": 2.9130372024476247e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171347311.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2097
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.028673835125448,
|
|
"grad_norm": 0.08117952217521109,
|
|
"learning_rate": 2.905103163807982e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171427232.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2098
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149940263022739e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.031541218637993,
|
|
"grad_norm": 0.08009694635732721,
|
|
"learning_rate": 2.8971781080350665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171500263.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2099
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.449667444137735e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.034408602150537,
|
|
"grad_norm": 0.07636092627316961,
|
|
"learning_rate": 2.889262045162817e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 171575919.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.037275985663083,
|
|
"grad_norm": 0.26612624401000295,
|
|
"learning_rate": 2.8813549852137817e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171668774.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2101
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.97925877074466e-09,
|
|
"advantages/std": 0.4676010012626648,
|
|
"advantages/var": 0.21865069638184664,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.040143369175627,
|
|
"grad_norm": 0.1473152837871326,
|
|
"learning_rate": 2.8734569381991083e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 171740920.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2102
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.043010752688172,
|
|
"grad_norm": 0.10967122349031659,
|
|
"learning_rate": 2.8655679141185283e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171817552.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2103
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33065706491470337,
|
|
"advantages/var": 0.10933409457800636,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.045878136200717,
|
|
"grad_norm": 0.10154421462493861,
|
|
"learning_rate": 2.85768792296036e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 171892113.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.06681530922651291,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2104
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.048745519713262,
|
|
"grad_norm": 0.04514155771527937,
|
|
"learning_rate": 2.849816974701482e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 171970780.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.051612903225807,
|
|
"grad_norm": 0.10568532434878228,
|
|
"learning_rate": 2.841955079307319e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172055799.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2106
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.054480286738351,
|
|
"grad_norm": 0.05230799095516218,
|
|
"learning_rate": 2.8341022467318334e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 172129948.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2107
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.057347670250896,
|
|
"grad_norm": 0.06759943098625627,
|
|
"learning_rate": 2.8262584869175223e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172217712.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2108
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.060215053763441,
|
|
"grad_norm": 0.13865722005142697,
|
|
"learning_rate": 2.818423809795384e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172302415.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2109
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.975166506069069e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 6.063082437275986,
|
|
"grad_norm": 0.152552453284122,
|
|
"learning_rate": 2.810598225284928e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172388646.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.06594982078853,
|
|
"grad_norm": 0.10379550817821541,
|
|
"learning_rate": 2.8027817432941425e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172459177.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2111
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.068817204301075,
|
|
"grad_norm": 0.12102448385517477,
|
|
"learning_rate": 2.7949743737194985e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172541974.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2112
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.033763001694113e-09,
|
|
"advantages/std": 0.6185612082481384,
|
|
"advantages/var": 0.3826179683493969,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.07168458781362,
|
|
"grad_norm": 0.1951219789894647,
|
|
"learning_rate": 2.7871761264459225e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172629292.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1751839816570282,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2113
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.4496412956177784e-09,
|
|
"advantages/std": 0.40496495366096497,
|
|
"advantages/var": 0.1639966136936275,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.074551971326165,
|
|
"grad_norm": 0.07114323224805799,
|
|
"learning_rate": 2.7793870113468e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172703660.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0936255231499672,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2114
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.50497202691776e-09,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.077419354838709,
|
|
"grad_norm": 0.12900599247939767,
|
|
"learning_rate": 2.771607038283942e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172778530.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.080286738351255,
|
|
"grad_norm": 0.07064987751422128,
|
|
"learning_rate": 2.7638362171076e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172852560.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2116
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.5933536669925965e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.0831541218637994,
|
|
"grad_norm": 0.14764179501176933,
|
|
"learning_rate": 2.756074557656424e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 172930649.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2117
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.086021505376344,
|
|
"grad_norm": 0.0331028529995827,
|
|
"learning_rate": 2.748322069757476e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 173013002.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2118
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.088888888888889,
|
|
"grad_norm": 0.10707429291468022,
|
|
"learning_rate": 2.740578763226193e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 173080690.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2119
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.7495735248314805e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.091756272401434,
|
|
"grad_norm": 0.10277021605296317,
|
|
"learning_rate": 2.7328446478664036e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 173158696.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983355479520339e-09,
|
|
"advantages/std": 0.4676070511341095,
|
|
"advantages/var": 0.2186563542703377,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 6.094623655913979,
|
|
"grad_norm": 0.09058752972675965,
|
|
"learning_rate": 2.7251197334702835e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 173252745.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12019838392734528,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2121
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.097491039426523,
|
|
"grad_norm": 0.11113347953395288,
|
|
"learning_rate": 2.717404029818371e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 173343239.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2122
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 6.100358422939068,
|
|
"grad_norm": 0.0808273680316428,
|
|
"learning_rate": 2.7096975466795367e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 173431614.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2123
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.103225806451613,
|
|
"grad_norm": 0.11090880856332312,
|
|
"learning_rate": 2.7020002938109756e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 173515046.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2124
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.106093189964158,
|
|
"grad_norm": 0.06491809513310076,
|
|
"learning_rate": 2.6943122809581997e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 173601146.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.108960573476702,
|
|
"grad_norm": 0.08643256079925163,
|
|
"learning_rate": 2.6866335178550257e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 173683728.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2126
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.2044708608067508e-08,
|
|
"advantages/std": 0.6185770630836487,
|
|
"advantages/var": 0.3826375829731923,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.111827956989248,
|
|
"grad_norm": 0.17513950419277075,
|
|
"learning_rate": 2.678964014223553e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 173780660.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.19568344950675964,
|
|
"rewards/drgrpo_math_reward/mean": 0.6875,
|
|
"rewards/drgrpo_math_reward/std": 0.4653336703777313,
|
|
"step": 2127
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.114695340501792,
|
|
"grad_norm": 0.08235783336423096,
|
|
"learning_rate": 2.6713037797741543e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 173864945.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2128
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.117562724014337,
|
|
"grad_norm": 0.12465393811280526,
|
|
"learning_rate": 2.663652824205476e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 173926974.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2129
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.120430107526881,
|
|
"grad_norm": 0.04977249967066248,
|
|
"learning_rate": 2.656011157204415e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174004946.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"advantages/mean": -7.916241884231567e-09,
|
|
"advantages/snr": 1.6929285604174867e-08,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.123297491039427,
|
|
"grad_norm": 0.10518807899595124,
|
|
"learning_rate": 2.648378788446102e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174084487.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2131
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983334154224049e-09,
|
|
"advantages/std": 0.46760955452919006,
|
|
"advantages/var": 0.21865869548698758,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.1261648745519715,
|
|
"grad_norm": 0.1088275661340618,
|
|
"learning_rate": 2.6407557275938955e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174158577.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.12020328640937805,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2132
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.129032258064516,
|
|
"grad_norm": 0.0946231878141697,
|
|
"learning_rate": 2.633141984299374e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 174231906.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08443662524223328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2133
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.131899641577061,
|
|
"grad_norm": 0.09503508407204056,
|
|
"learning_rate": 2.625537568202322e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174304214.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2134
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.134767025089606,
|
|
"grad_norm": 0.028863618471970904,
|
|
"learning_rate": 2.6179424889307043e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 174374186.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 7.041852881974499e-10,
|
|
"advantages/std": 0.661276638507843,
|
|
"advantages/var": 0.4372867926362325,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 6.137634408602151,
|
|
"grad_norm": 0.14794659570921173,
|
|
"learning_rate": 2.61035675610067e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174466360.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.2109457552433014,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 2136
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9750579720916185e-09,
|
|
"advantages/std": 0.4676051139831543,
|
|
"advantages/var": 0.21865454262319872,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.140501792114695,
|
|
"grad_norm": 0.11420182933445344,
|
|
"learning_rate": 2.6027803793165347e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174546294.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.11784427613019943,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2137
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.14336917562724,
|
|
"grad_norm": 0.08110650673113542,
|
|
"learning_rate": 2.595213368170772e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 174637659.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2138
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.235170151758147e-09,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.146236559139785,
|
|
"grad_norm": 0.14099708405961642,
|
|
"learning_rate": 2.5876557322439916e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174712307.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2139
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.95842995348603e-10,
|
|
"advantages/std": 0.4676051139831543,
|
|
"advantages/var": 0.21865454262319872,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.14910394265233,
|
|
"grad_norm": 0.08674400597712102,
|
|
"learning_rate": 2.5801074811049315e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 174794762.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.11784426867961884,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.151971326164874,
|
|
"grad_norm": 0.04589011037766571,
|
|
"learning_rate": 2.572568624310458e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174876942.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2141
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.15483870967742,
|
|
"grad_norm": 0.06478111910034079,
|
|
"learning_rate": 2.5650391714055296e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 174958060.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2142
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.157706093189964,
|
|
"grad_norm": 0.11004705135365284,
|
|
"learning_rate": 2.5575191319232127e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175042746.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2143
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.160573476702509,
|
|
"grad_norm": 0.09478780170202693,
|
|
"learning_rate": 2.550008515384642e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175113961.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2144
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.163440860215053,
|
|
"grad_norm": 0.07327068747479794,
|
|
"learning_rate": 2.542507331299033e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175193209.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 9.199247907244247e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.166308243727599,
|
|
"grad_norm": 0.1129679091727922,
|
|
"learning_rate": 2.5350155891636495e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 175281416.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2146
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.2197141188192748e-08,
|
|
"advantages/std": 0.5726685523986816,
|
|
"advantages/var": 0.3279492709064016,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.1691756272401435,
|
|
"grad_norm": 0.12686317848230652,
|
|
"learning_rate": 2.52753329846381e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175371588.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.14283224940299988,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2147
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.172043010752688,
|
|
"grad_norm": 0.08997490709775456,
|
|
"learning_rate": 2.5200604686728555e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175451750.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2148
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.174910394265233,
|
|
"grad_norm": 0.10502798239013662,
|
|
"learning_rate": 2.5125971092521604e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 175529707.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2149
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.177777777777778,
|
|
"grad_norm": 0.07182949272215379,
|
|
"learning_rate": 2.5051432296510976e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175607197.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 8.944085194988241e-09,
|
|
"advantages/std": 0.5726996064186096,
|
|
"advantages/var": 0.32798483919203036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.180645161290323,
|
|
"grad_norm": 0.1386335795070189,
|
|
"learning_rate": 2.4976988393070476e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175688409.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.17912298440933228,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2151
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2945924285545487e-08,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.183512544802867,
|
|
"grad_norm": 0.09564244720437229,
|
|
"learning_rate": 2.490263947645367e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175773164.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2152
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.186379928315413,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 2.482838564079397e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175828254.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 1.0,
|
|
"rewards/drgrpo_math_reward/std": 0.0,
|
|
"step": 2153
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966992261291921e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.189247311827957,
|
|
"grad_norm": 0.13772180316010407,
|
|
"learning_rate": 2.4754226980104274e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 175910301.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2154
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975300560494157e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 6.192114695340502,
|
|
"grad_norm": 0.10975078879450248,
|
|
"learning_rate": 2.4680163588277113e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175989564.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.194982078853046,
|
|
"grad_norm": 0.10427408056680003,
|
|
"learning_rate": 2.46061955590843e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176060090.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2156
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.5050938954247684e-09,
|
|
"advantages/std": 0.5726727843284607,
|
|
"advantages/var": 0.32795411791051166,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.197849462365592,
|
|
"grad_norm": 0.11036299784771562,
|
|
"learning_rate": 2.4532322986176925e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176140610.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.1462520956993103,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2157
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.200716845878136,
|
|
"grad_norm": 0.12461236277193655,
|
|
"learning_rate": 2.4458545963085255e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176221669.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2158
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.203584229390681,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 2.438486458321859e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176303479.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 1.0,
|
|
"rewards/drgrpo_math_reward/std": 0.0,
|
|
"step": 2159
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 6.2064516129032254,
|
|
"grad_norm": 0.08428672676983999,
|
|
"learning_rate": 2.43112789398651e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176392961.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.209318996415771,
|
|
"grad_norm": 0.07445476343188323,
|
|
"learning_rate": 2.423778912619171e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176475304.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2161
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.2121863799283155,
|
|
"grad_norm": 0.09577628669796408,
|
|
"learning_rate": 2.4164395235244096e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 176548466.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2162
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.21505376344086,
|
|
"grad_norm": 0.07044764921255095,
|
|
"learning_rate": 2.409109735994647e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176633891.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2163
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.217921146953405,
|
|
"grad_norm": 0.07468729755447735,
|
|
"learning_rate": 2.4017895593101424e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176709801.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2164
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.22078853046595,
|
|
"grad_norm": 0.15074373847353156,
|
|
"learning_rate": 2.3944790027389885e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176791822.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975166506069069e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.223655913978495,
|
|
"grad_norm": 0.09859531379863297,
|
|
"learning_rate": 2.3871780755371e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 176877808.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2166
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.226523297491039,
|
|
"grad_norm": 0.09765968271834147,
|
|
"learning_rate": 2.379886786948204e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 176964248.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2167
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299829409932592e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.229390681003585,
|
|
"grad_norm": 0.07368850019587517,
|
|
"learning_rate": 2.3726051462038155e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 177038422.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2168
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.449710856633628e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.232258064516129,
|
|
"grad_norm": 0.13374477118857409,
|
|
"learning_rate": 2.3653331625232365e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 177123127.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 2169
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971078240891425e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 6.235125448028674,
|
|
"grad_norm": 0.13838951832047827,
|
|
"learning_rate": 2.3580708451135445e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 177210861.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.449667444137735e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.237992831541218,
|
|
"grad_norm": 0.07130469931870363,
|
|
"learning_rate": 2.3508182031695856e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 177291262.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2171
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814036618739657e-09,
|
|
"advantages/std": 0.5228026509284973,
|
|
"advantages/var": 0.2733226118178642,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.240860215053764,
|
|
"grad_norm": 0.12359943260731508,
|
|
"learning_rate": 2.3435752458739356e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 177377112.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.14913517236709595,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2172
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.243727598566308,
|
|
"grad_norm": 0.11358099968057224,
|
|
"learning_rate": 2.3363419823969276e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 177457168.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2173
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.246594982078853,
|
|
"grad_norm": 0.10146949140676484,
|
|
"learning_rate": 2.3291184218966163e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 177532809.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2174
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.2494623655913975,
|
|
"grad_norm": 0.07527906630518931,
|
|
"learning_rate": 2.3219045735187647e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 177618491.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.449710856633628e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.252329749103943,
|
|
"grad_norm": 0.07065746652929582,
|
|
"learning_rate": 2.31470044639685e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 177705335.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2176
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.2551971326164875,
|
|
"grad_norm": 0.0598040880718208,
|
|
"learning_rate": 2.3075060496520304e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 177782197.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2177
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.258064516129032,
|
|
"grad_norm": 0.08774742986790071,
|
|
"learning_rate": 2.3003213923931543e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 177864611.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2178
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958516906788102e-10,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.260931899641577,
|
|
"grad_norm": 0.10555678125680912,
|
|
"learning_rate": 2.2931464837167303e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 177938506.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2179
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6722005458391303e-09,
|
|
"advantages/std": 0.5227840542793274,
|
|
"advantages/var": 0.27330316740873073,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.263799283154122,
|
|
"grad_norm": 0.15601084798585932,
|
|
"learning_rate": 2.2859813327069323e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178019745.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.13204574584960938,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 9.199387373273677e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.266666666666667,
|
|
"grad_norm": 0.09967947574545993,
|
|
"learning_rate": 2.278825948435571e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178095181.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2181
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.022367650963836e-09,
|
|
"advantages/std": 0.6185756921768188,
|
|
"advantages/var": 0.38263588695203055,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.269534050179211,
|
|
"grad_norm": 0.18715205810324997,
|
|
"learning_rate": 2.2716803399621022e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178178090.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.19673937559127808,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2182
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975244195968941e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.272401433691757,
|
|
"grad_norm": 0.1651917055597025,
|
|
"learning_rate": 2.264544516333594e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178251810.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2183
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814647963303764e-09,
|
|
"advantages/std": 0.5227847099304199,
|
|
"advantages/var": 0.2733038529370333,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.275268817204301,
|
|
"grad_norm": 0.12158024065271945,
|
|
"learning_rate": 2.2574184865847345e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178317584.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.13310657441616058,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2184
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.278136200716846,
|
|
"grad_norm": 0.13190135349811286,
|
|
"learning_rate": 2.250302259737803e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178386650.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983562397524497e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.28100358422939,
|
|
"grad_norm": 0.20644704536939218,
|
|
"learning_rate": 2.2431958448026788e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178455960.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2186
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975243815131171e-09,
|
|
"advantages/std": 0.4675905704498291,
|
|
"advantages/var": 0.2186409415735966,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.283870967741936,
|
|
"grad_norm": 0.14604786537556594,
|
|
"learning_rate": 2.236099250776805e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178542847.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2187
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.4536934733273465e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.28673835125448,
|
|
"grad_norm": 0.11127009467183546,
|
|
"learning_rate": 2.2290124866452031e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178622477.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2188
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.289605734767025,
|
|
"grad_norm": 0.0683364615828289,
|
|
"learning_rate": 2.2219355613804402e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178710624.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2189
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.29247311827957,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 2.2148684839426258e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178788195.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983400669593257e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.295340501792115,
|
|
"grad_norm": 0.13152233811811465,
|
|
"learning_rate": 2.2078112632794088e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 178873788.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2191
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.2982078853046595,
|
|
"grad_norm": 0.04896511793291085,
|
|
"learning_rate": 2.2007639083259543e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 178946382.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2192
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.5933536669925965e-08,
|
|
"advantages/std": 0.4676036834716797,
|
|
"advantages/var": 0.2186532047962828,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.301075268817204,
|
|
"grad_norm": 0.3471667942809353,
|
|
"learning_rate": 2.193726428004936e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179019434.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.11572261154651642,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2193
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983539800525091e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.3039426523297495,
|
|
"grad_norm": 0.09015407532640535,
|
|
"learning_rate": 2.186698831226521e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 179107353.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 2194
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562971027883829e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.306810035842294,
|
|
"grad_norm": 0.09622614826554285,
|
|
"learning_rate": 2.1796811268883707e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179193378.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 5.691935831188304e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.309677419354839,
|
|
"grad_norm": 0.1222421270988558,
|
|
"learning_rate": 2.1726733238756212e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179284419.0,
|
|
"reward": 0.6015625,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.6015625,
|
|
"rewards/drgrpo_math_reward/std": 0.4915000796318054,
|
|
"step": 2196
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 6.312544802867383,
|
|
"grad_norm": 0.08677875732593755,
|
|
"learning_rate": 2.165675431060866e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 179371660.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2197
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.315412186379929,
|
|
"grad_norm": 0.09056154349879256,
|
|
"learning_rate": 2.1586874573041524e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179441110.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 2198
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.318279569892473,
|
|
"grad_norm": 0.11920633426040753,
|
|
"learning_rate": 2.1517094114529742e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179504970.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2199
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.321146953405018,
|
|
"grad_norm": 0.08747785758403151,
|
|
"learning_rate": 2.1447413023422556e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179583427.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.5050383769830774e-09,
|
|
"advantages/std": 0.5726776719093323,
|
|
"advantages/var": 0.3279597159034928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.324014336917562,
|
|
"grad_norm": 0.12711324194947438,
|
|
"learning_rate": 2.1377831387943346e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 179677651.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.154142826795578,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2201
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.326881720430108,
|
|
"grad_norm": 0.05299389151286508,
|
|
"learning_rate": 2.1308349296189566e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179754552.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2202
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.329749103942652,
|
|
"grad_norm": 0.09902881237398599,
|
|
"learning_rate": 2.123896683613269e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179832497.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2203
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.332616487455197,
|
|
"grad_norm": 0.10751438341166206,
|
|
"learning_rate": 2.116968409561809e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 179905768.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2204
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.335483870967742,
|
|
"grad_norm": 0.04054469876989426,
|
|
"learning_rate": 2.1100501162364703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 179977415.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.338351254480287,
|
|
"grad_norm": 0.07256929125803574,
|
|
"learning_rate": 2.103141812396526e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180047196.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2206
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 6.3412186379928315,
|
|
"grad_norm": 0.07705694937524302,
|
|
"learning_rate": 2.096243506788602e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180138073.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 2207
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.344086021505376,
|
|
"grad_norm": 0.05616303608045406,
|
|
"learning_rate": 2.0893552081466559e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180213527.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2208
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049402934764073e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 6.3469534050179215,
|
|
"grad_norm": 0.09284175268100746,
|
|
"learning_rate": 2.082476925191977e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 180290202.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2209
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.349820788530466,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 2.0756086666331818e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180364140.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383829008079518e-08,
|
|
"advantages/std": 0.5726770758628845,
|
|
"advantages/var": 0.327959033218864,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 6.352688172043011,
|
|
"grad_norm": 0.18350218141560903,
|
|
"learning_rate": 2.0687504411661895e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180441817.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1530819833278656,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2211
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380984365940094,
|
|
"advantages/var": 0.05466704299203351,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.355555555555555,
|
|
"grad_norm": 0.06370705177952349,
|
|
"learning_rate": 2.0619022574742118e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 180502220.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2212
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9834382441424445e-09,
|
|
"advantages/std": 0.4675973355770111,
|
|
"advantages/var": 0.21864726823871994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.358422939068101,
|
|
"grad_norm": 0.10124231580148321,
|
|
"learning_rate": 2.0550641242277577e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180589674.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.11100947856903076,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2213
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.361290322580645,
|
|
"grad_norm": 0.06561819334727217,
|
|
"learning_rate": 2.0482360500845996e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180661624.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2214
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.36415770609319,
|
|
"grad_norm": 0.07398576384984183,
|
|
"learning_rate": 2.0414180436897844e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180745594.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675973355770111,
|
|
"advantages/var": 0.21864726823871994,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.367025089605734,
|
|
"grad_norm": 0.13403602765557324,
|
|
"learning_rate": 2.0346101136756e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180821614.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.11100947856903076,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2216
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33063647150993347,
|
|
"advantages/var": 0.10932047629253905,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.36989247311828,
|
|
"grad_norm": 0.11993568492098934,
|
|
"learning_rate": 2.0278122686615918e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 180897012.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2217
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 7.317976997561578e-09,
|
|
"advantages/std": 0.5726926326751709,
|
|
"advantages/var": 0.3279768515204182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.372759856630824,
|
|
"grad_norm": 0.11492715960757334,
|
|
"learning_rate": 2.0210245172545226e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181002163.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.1712273508310318,
|
|
"rewards/drgrpo_math_reward/mean": 0.625,
|
|
"rewards/drgrpo_math_reward/std": 0.4860251843929291,
|
|
"step": 2218
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.375627240143369,
|
|
"grad_norm": 0.11422293213003529,
|
|
"learning_rate": 2.014246868048385e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181082317.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2219
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.378494623655914,
|
|
"grad_norm": 0.07362381212389328,
|
|
"learning_rate": 2.007479329624374e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181159318.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.907185867989424e-10,
|
|
"advantages/std": 0.5227928161621094,
|
|
"advantages/var": 0.2733123286307091,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.381362007168459,
|
|
"grad_norm": 0.12756889894538026,
|
|
"learning_rate": 2.0007219105508933e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181244107.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2221
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958683201273463e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.3842293906810035,
|
|
"grad_norm": 0.09031641699231324,
|
|
"learning_rate": 1.9939746193835228e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181325577.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2222
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.2584259521113475e-09,
|
|
"advantages/std": 0.6185652613639832,
|
|
"advantages/var": 0.3826229825662928,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.387096774193548,
|
|
"grad_norm": 0.1594836306495329,
|
|
"learning_rate": 1.9872374646650237e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181422169.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.1820138692855835,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 2223
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.130995003363801e-09,
|
|
"advantages/std": 0.5726990103721619,
|
|
"advantages/var": 0.32798415648125356,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.3899641577060935,
|
|
"grad_norm": 0.16746281921590245,
|
|
"learning_rate": 1.980510454925327e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 181503864.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.17806214094161987,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2224
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.392831541218638,
|
|
"grad_norm": 0.046949961897370086,
|
|
"learning_rate": 1.97379359868152e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181589918.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504949684853452e-09,
|
|
"advantages/std": 0.5726854801177979,
|
|
"advantages/var": 0.32796865913775264,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.395698924731183,
|
|
"grad_norm": 0.1638038029950196,
|
|
"learning_rate": 1.967086904437828e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181666161.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.15992169082164764,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2226
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379866977655094e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.398566308243727,
|
|
"grad_norm": 0.08787095868807605,
|
|
"learning_rate": 1.9603903806856105e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 181759538.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 2227
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450174584180815e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.401433691756273,
|
|
"grad_norm": 0.049987115694361976,
|
|
"learning_rate": 1.9537040359033563e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181840738.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2228
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.404301075268817,
|
|
"grad_norm": 0.054362269499827566,
|
|
"learning_rate": 1.947027878556665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 181909564.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2229
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.407168458781362,
|
|
"grad_norm": 0.07560767251882519,
|
|
"learning_rate": 1.9403619170982355e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 181987820.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.410035842293907,
|
|
"grad_norm": 0.16746578952250954,
|
|
"learning_rate": 1.9337061599678538e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 182063290.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2231
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.505166341645741e-09,
|
|
"advantages/std": 0.5726664066314697,
|
|
"advantages/var": 0.32794681328419983,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.412903225806452,
|
|
"grad_norm": 0.17678767605376877,
|
|
"learning_rate": 1.927060615592394e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 182146488.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.13941732048988342,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2232
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599614475511504e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.415770609318996,
|
|
"grad_norm": 0.06865997664293769,
|
|
"learning_rate": 1.9204252923858e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 182230991.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2233
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96693437436781e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.418637992831541,
|
|
"grad_norm": 0.07838817614893728,
|
|
"learning_rate": 1.913800198749067e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 182317276.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2234
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966859224177393e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 6.421505376344086,
|
|
"grad_norm": 0.11167399598179967,
|
|
"learning_rate": 1.9071853430702412e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 182397868.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983361318629381e-09,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.424372759856631,
|
|
"grad_norm": 0.09026306625703183,
|
|
"learning_rate": 1.9005807337244107e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 182482710.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2236
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983266117900325e-09,
|
|
"advantages/std": 0.46761754155158997,
|
|
"advantages/var": 0.21866616516675297,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.4272401433691755,
|
|
"grad_norm": 0.11437771422820203,
|
|
"learning_rate": 1.8939863790736922e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 182559246.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.130448117852211,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2237
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.43010752688172,
|
|
"grad_norm": 0.09562689718485126,
|
|
"learning_rate": 1.8874022874672057e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 182628246.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 2238
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998246708054356e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.4329749103942655,
|
|
"grad_norm": 0.13211713898500313,
|
|
"learning_rate": 1.88082846724109e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 182708667.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2239
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899641578136434e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.43584229390681,
|
|
"grad_norm": 0.08708094307258604,
|
|
"learning_rate": 1.8742649267184796e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 182791955.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.438709677419355,
|
|
"grad_norm": 0.15321504604046224,
|
|
"learning_rate": 1.8677116742094856e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 182864080.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2241
|
|
},
|
|
{
|
|
"advantages/mean": 6.05359673500061e-09,
|
|
"advantages/snr": 1.1579395756169161e-08,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.4415770609319,
|
|
"grad_norm": 0.1586511176955155,
|
|
"learning_rate": 1.8611687180111956e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 182940100.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2242
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.444444444444445,
|
|
"grad_norm": 0.08017374574585492,
|
|
"learning_rate": 1.8546360664076655e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183019220.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2243
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721560650546284e-09,
|
|
"advantages/std": 0.5227927565574646,
|
|
"advantages/var": 0.27331226630895245,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.447311827956989,
|
|
"grad_norm": 0.09724299830917557,
|
|
"learning_rate": 1.8481137276699042e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183105169.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1412346363067627,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2244
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.450179211469534,
|
|
"grad_norm": 0.09035132973576847,
|
|
"learning_rate": 1.841601710055859e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183174377.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.563018557708836e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.453046594982079,
|
|
"grad_norm": 0.09789317420625834,
|
|
"learning_rate": 1.8351000218104084e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183247680.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.1173202320933342,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2246
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344599133202926e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.455913978494624,
|
|
"grad_norm": 0.1637132592879638,
|
|
"learning_rate": 1.8286086711653604e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183327463.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2247
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.458781362007168,
|
|
"grad_norm": 0.14268225971430398,
|
|
"learning_rate": 1.8221276663394314e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183412394.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2248
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.461648745519713,
|
|
"grad_norm": 0.0952562831504514,
|
|
"learning_rate": 1.8156570155382355e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183484205.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2249
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.464516129032258,
|
|
"grad_norm": 0.05675163813786104,
|
|
"learning_rate": 1.8091967269542774e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183555605.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.467383512544803,
|
|
"grad_norm": 0.09555172292089295,
|
|
"learning_rate": 1.8027468087669485e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183615502.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2251
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.4702508960573475,
|
|
"grad_norm": 0.09915418889669067,
|
|
"learning_rate": 1.7963072691425085e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 183692283.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2252
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.991725342260504e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.473118279569892,
|
|
"grad_norm": 0.12942772670108643,
|
|
"learning_rate": 1.7898781162340682e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183770014.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2253
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.59950277215078e-09,
|
|
"advantages/std": 0.4049666225910187,
|
|
"advantages/var": 0.16399796541277656,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.4759856630824375,
|
|
"grad_norm": 0.09790204833525104,
|
|
"learning_rate": 1.7834593581816017e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183853983.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.09574718773365021,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2254
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33063647150993347,
|
|
"advantages/var": 0.10932047629253905,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.478853046594982,
|
|
"grad_norm": 0.07205758936828763,
|
|
"learning_rate": 1.7770510031119102e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 183932942.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.481720430107527,
|
|
"grad_norm": 0.1132707791729546,
|
|
"learning_rate": 1.770653059138626e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184006219.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2256
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.484587813620072,
|
|
"grad_norm": 0.06045306560334811,
|
|
"learning_rate": 1.7642655343622047e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184090803.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2257
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.487455197132617,
|
|
"grad_norm": 0.12755924552355358,
|
|
"learning_rate": 1.757888436869911e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184179938.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2258
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.490322580645161,
|
|
"grad_norm": 0.13122605358577782,
|
|
"learning_rate": 1.7515217747358013e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184261909.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2259
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.493189964157706,
|
|
"grad_norm": 0.10031688387770546,
|
|
"learning_rate": 1.745165556020718e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184324244.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.496057347670251,
|
|
"grad_norm": 0.07288569607456609,
|
|
"learning_rate": 1.738819788772291e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 184396666.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2261
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299811976811062e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.498924731182796,
|
|
"grad_norm": 0.1640293487561573,
|
|
"learning_rate": 1.7324844810249128e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 184479153.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2262
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.50179211469534,
|
|
"grad_norm": 0.03134484975609983,
|
|
"learning_rate": 1.7261596407997303e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184550997.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2263
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.504659498207886,
|
|
"grad_norm": 0.12737389137270913,
|
|
"learning_rate": 1.7198452761046378e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184623639.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2264
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.5629543724332273e-09,
|
|
"advantages/std": 0.5227810740470886,
|
|
"advantages/var": 0.27330005138182756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.50752688172043,
|
|
"grad_norm": 0.12360684737711228,
|
|
"learning_rate": 1.7135413949342704e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184700828.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.510394265232975,
|
|
"grad_norm": 0.08835364361365992,
|
|
"learning_rate": 1.70724800526999e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184778848.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2266
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.5132616487455195,
|
|
"grad_norm": 0.030580109273297615,
|
|
"learning_rate": 1.700965115079871e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 184849812.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2267
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.516129032258064,
|
|
"grad_norm": 0.075470863165079,
|
|
"learning_rate": 1.6946927323186942e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 184928483.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2268
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958849501312727e-10,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.5189964157706095,
|
|
"grad_norm": 0.12383606307308048,
|
|
"learning_rate": 1.688430864927941e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185009809.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.09863808006048203,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2269
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33065706491470337,
|
|
"advantages/var": 0.10933409457800636,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.521863799283154,
|
|
"grad_norm": 0.08853756988079814,
|
|
"learning_rate": 1.6821795208357824e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185080483.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.06681530922651291,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.524731182795699,
|
|
"grad_norm": 0.06285731070616504,
|
|
"learning_rate": 1.675938707957053e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 185160894.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2271
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.527598566308244,
|
|
"grad_norm": 0.1703460319537216,
|
|
"learning_rate": 1.6697084341932631e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185237373.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2272
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958149428367127e-09,
|
|
"advantages/std": 0.46761828660964966,
|
|
"advantages/var": 0.21866686197174445,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.530465949820789,
|
|
"grad_norm": 0.10918731325145618,
|
|
"learning_rate": 1.6634887074325842e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185319943.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.1315089464187622,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2273
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.5055955396409083e-09,
|
|
"advantages/std": 0.6185742020606995,
|
|
"advantages/var": 0.38263404345503105,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 6.533333333333333,
|
|
"grad_norm": 0.13860061076008143,
|
|
"learning_rate": 1.6572795355498226e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185408472.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.19438527524471283,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2274
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.536200716845878,
|
|
"grad_norm": 0.07592588942538149,
|
|
"learning_rate": 1.651080926406425e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185482156.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.539068100358423,
|
|
"grad_norm": 0.12832812949240505,
|
|
"learning_rate": 1.6448928878504686e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185561234.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2276
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.541935483870968,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.638715427716648e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185630813.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2277
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.544802867383512,
|
|
"grad_norm": 0.06598763109472212,
|
|
"learning_rate": 1.6325485538262563e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185707527.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2278
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.547670250896058,
|
|
"grad_norm": 0.08948401778154895,
|
|
"learning_rate": 1.6263922739871882e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185787243.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2279
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.550537634408602,
|
|
"grad_norm": 0.10329354184463344,
|
|
"learning_rate": 1.620246595993925e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 185859425.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.23513460695797e-09,
|
|
"advantages/std": 0.5227840542793274,
|
|
"advantages/var": 0.27330316740873073,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.553405017921147,
|
|
"grad_norm": 0.12655950039508182,
|
|
"learning_rate": 1.6141115276275297e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 185938711.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.13204574584960938,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2281
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.5562724014336915,
|
|
"grad_norm": 0.09904348854237273,
|
|
"learning_rate": 1.6079870766556236e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186014383.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2282
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.559139784946236,
|
|
"grad_norm": 0.08147217328524484,
|
|
"learning_rate": 1.6018732508323885e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186091072.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2283
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.5620071684587815,
|
|
"grad_norm": 0.1543357073643548,
|
|
"learning_rate": 1.5957700578985557e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186171198.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2284
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983467187183905e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.564874551971326,
|
|
"grad_norm": 0.09278990300735251,
|
|
"learning_rate": 1.5896775055813973e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186245214.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.567741935483871,
|
|
"grad_norm": 0.05992901706913987,
|
|
"learning_rate": 1.5835956015947038e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186326430.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2286
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967257841033182e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.570609318996416,
|
|
"grad_norm": 0.1278084908427023,
|
|
"learning_rate": 1.5775243536387907e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186403537.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2287
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2946288161655503e-08,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.573476702508961,
|
|
"grad_norm": 0.14354059395101898,
|
|
"learning_rate": 1.5714637694004819e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186478551.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2288
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016540399721408e-09,
|
|
"advantages/std": 0.5227880477905273,
|
|
"advantages/var": 0.2733073429126307,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.576344086021505,
|
|
"grad_norm": 0.12563023202974105,
|
|
"learning_rate": 1.565413856553095e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186555219.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2289
|
|
},
|
|
{
|
|
"advantages/mean": 4.6566128730773926e-09,
|
|
"advantages/snr": 8.907227504745508e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.57921146953405,
|
|
"grad_norm": 0.1043286914567863,
|
|
"learning_rate": 1.559374622756441e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 186637660.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.13781969249248505,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.582078853046595,
|
|
"grad_norm": 0.11406391941071242,
|
|
"learning_rate": 1.5533460756568128e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 186707110.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2291
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016434378286722e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.58494623655914,
|
|
"grad_norm": 0.12958651018177703,
|
|
"learning_rate": 1.5473282228869665e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186785620.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2292
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.587813620071684,
|
|
"grad_norm": 0.04880423815887028,
|
|
"learning_rate": 1.5413210720661186e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186857710.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2293
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.59068100358423,
|
|
"grad_norm": 0.08622057145780773,
|
|
"learning_rate": 1.535324630799939e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 186945087.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2294
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.593548387096774,
|
|
"grad_norm": 0.06513710758080316,
|
|
"learning_rate": 1.5293389066805397e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 187019967.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958906628562059e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.596415770609319,
|
|
"grad_norm": 0.12757381249902855,
|
|
"learning_rate": 1.523363907286459e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 187097472.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2296
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.5992831541218635,
|
|
"grad_norm": 0.10909393350858522,
|
|
"learning_rate": 1.517399640182656e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 187166997.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2297
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.602150537634409,
|
|
"grad_norm": 0.055835861752719866,
|
|
"learning_rate": 1.511446112920508e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 187247103.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2298
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 6.6050179211469535,
|
|
"grad_norm": 0.10637210493636282,
|
|
"learning_rate": 1.5055033330377907e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 187327258.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 2299
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.607885304659498,
|
|
"grad_norm": 0.04125068953551882,
|
|
"learning_rate": 1.4995713080586735e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 187406269.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.610752688172043,
|
|
"grad_norm": 0.11344854653970772,
|
|
"learning_rate": 1.493650045493703e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 187483942.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2301
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.613620071684588,
|
|
"grad_norm": 0.09709970395074634,
|
|
"learning_rate": 1.4877395528398085e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 187568404.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2302
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917366402546925e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.616487455197133,
|
|
"grad_norm": 0.10086578741031085,
|
|
"learning_rate": 1.4818398375802833e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 187652589.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2303
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149940263022739e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 6.619354838709677,
|
|
"grad_norm": 0.06947631908435752,
|
|
"learning_rate": 1.4759509071847632e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 187740747.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 2304
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958683201273463e-10,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.622222222222222,
|
|
"grad_norm": 0.09733944745106135,
|
|
"learning_rate": 1.4700727691092418e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 187822220.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.625089605734767,
|
|
"grad_norm": 0.08233489562826543,
|
|
"learning_rate": 1.464205430796047e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 187907879.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2306
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917335935919526e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 6.627956989247312,
|
|
"grad_norm": 0.06916655964207967,
|
|
"learning_rate": 1.458348899673829e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 187997297.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2307
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.630824372759856,
|
|
"grad_norm": 0.08410716414331963,
|
|
"learning_rate": 1.452503183157554e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 188082060.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2308
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.633691756272402,
|
|
"grad_norm": 0.04522199115014992,
|
|
"learning_rate": 1.4466682886485004e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 188154859.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2309
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971078240891425e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.636559139784946,
|
|
"grad_norm": 0.10442753300294257,
|
|
"learning_rate": 1.4408442235342455e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 188238796.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.639426523297491,
|
|
"grad_norm": 0.055922668216063266,
|
|
"learning_rate": 1.4350309951886485e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 188319030.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2311
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.6422939068100355,
|
|
"grad_norm": 0.08318317416468163,
|
|
"learning_rate": 1.4292286109718532e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 188397441.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2312
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.645161290322581,
|
|
"grad_norm": 0.05860914220795526,
|
|
"learning_rate": 1.4234370782302741e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 188473863.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2313
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950220288145723e-08,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.6480286738351255,
|
|
"grad_norm": 0.1288524821089973,
|
|
"learning_rate": 1.4176564042965867e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 188563518.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2314
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.65089605734767,
|
|
"grad_norm": 0.06351548222281105,
|
|
"learning_rate": 1.411886596489714e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 188638305.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.394200364231044e-08,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.6537634408602155,
|
|
"grad_norm": 0.11442339823270581,
|
|
"learning_rate": 1.4061276621148244e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 188710787.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2316
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494468808174133,
|
|
"advantages/var": 0.16398020040561878,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.65663082437276,
|
|
"grad_norm": 0.13153217173799472,
|
|
"learning_rate": 1.4003796084633201e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 188792398.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2317
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 7.125942055767658e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 6.659498207885305,
|
|
"grad_norm": 0.09656465916026838,
|
|
"learning_rate": 1.3946424428128278e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 188891865.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 2318
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.662365591397849,
|
|
"grad_norm": 0.08164599281834158,
|
|
"learning_rate": 1.388916172427187e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 188968480.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2319
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899474012416308e-09,
|
|
"advantages/std": 0.4049537181854248,
|
|
"advantages/var": 0.16398751387220045,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.665232974910394,
|
|
"grad_norm": 0.1378308728977492,
|
|
"learning_rate": 1.383200804556438e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189049978.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.08443661779165268,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.668100358422939,
|
|
"grad_norm": 0.08325008115834502,
|
|
"learning_rate": 1.3774963464368294e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 189136583.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2321
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.670967741935484,
|
|
"grad_norm": 0.10295338191905493,
|
|
"learning_rate": 1.3718028052907848e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189214728.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2322
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.673835125448028,
|
|
"grad_norm": 0.12319470024336432,
|
|
"learning_rate": 1.3661201883269159e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 189282756.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2323
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299880526045478e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.676702508960574,
|
|
"grad_norm": 0.08015989045867306,
|
|
"learning_rate": 1.3604485027399926e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189365339.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2324
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899540529955257e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.679569892473118,
|
|
"grad_norm": 0.08966959448335038,
|
|
"learning_rate": 1.3547877557109544e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189443727.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.682437275985663,
|
|
"grad_norm": 0.07473854451423957,
|
|
"learning_rate": 1.349137954406885e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189527280.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2326
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.6853046594982075,
|
|
"grad_norm": 0.0840217923546371,
|
|
"learning_rate": 1.3434991059810153e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189604224.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2327
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.688172043010753,
|
|
"grad_norm": 0.07933183719617533,
|
|
"learning_rate": 1.3378712175727013e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189683559.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2328
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966710959040678e-09,
|
|
"advantages/std": 0.4676070511341095,
|
|
"advantages/var": 0.2186563542703377,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.6910394265232975,
|
|
"grad_norm": 0.13014079856538852,
|
|
"learning_rate": 1.3322542963074314e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189762737.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.12019838392734528,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2329
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016648251989223e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.693906810035842,
|
|
"grad_norm": 0.11759822323468655,
|
|
"learning_rate": 1.3266483492967984e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189839944.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.12756997346878052,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.6967741935483875,
|
|
"grad_norm": 0.061950973774611895,
|
|
"learning_rate": 1.3210533836385085e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 189928901.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2331
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975176026781512e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.699641577060932,
|
|
"grad_norm": 0.1491010784574104,
|
|
"learning_rate": 1.315469406416363e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190001614.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2332
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.702508960573477,
|
|
"grad_norm": 0.07483524795785089,
|
|
"learning_rate": 1.3098964247002497e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 190089738.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2333
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.705376344086021,
|
|
"grad_norm": 0.07908883625271022,
|
|
"learning_rate": 1.3043344455461315e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 190159776.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2334
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 8.1312150336472e-10,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 6.708243727598567,
|
|
"grad_norm": 0.15440760245013682,
|
|
"learning_rate": 1.298783475996046e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190242855.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.878747807970186e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.711111111111111,
|
|
"grad_norm": 0.12867147625864356,
|
|
"learning_rate": 1.2932435230780937e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190325072.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2336
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.713978494623656,
|
|
"grad_norm": 0.12791816007869064,
|
|
"learning_rate": 1.287714593806415e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 190407043.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2337
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2525469477123842e-09,
|
|
"advantages/std": 0.5726727843284607,
|
|
"advantages/var": 0.32795411791051166,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.7168458781362,
|
|
"grad_norm": 0.14296567310307667,
|
|
"learning_rate": 1.2821966951812045e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190494543.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.1462521106004715,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2338
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.719713261648746,
|
|
"grad_norm": 0.05962957855453991,
|
|
"learning_rate": 1.2766898341886912e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190572934.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2339
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9752099207640785e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.72258064516129,
|
|
"grad_norm": 0.10095528134966746,
|
|
"learning_rate": 1.2711940178011228e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190648783.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.725448028673835,
|
|
"grad_norm": 0.10052310936070973,
|
|
"learning_rate": 1.2657092529767644e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190719893.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 2341
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.96694656101877e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.7283154121863795,
|
|
"grad_norm": 0.0944250765680734,
|
|
"learning_rate": 1.2602355466598912e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190801481.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2342
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917335935919526e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.731182795698925,
|
|
"grad_norm": 0.18167790449043975,
|
|
"learning_rate": 1.254772905780781e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 190874609.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2343
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.7340501792114695,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.2493213372556933e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 190950231.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2344
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.736917562724014,
|
|
"grad_norm": 0.09988909391940451,
|
|
"learning_rate": 1.2438808479868711e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 191025493.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.7397849462365595,
|
|
"grad_norm": 0.10481401419743164,
|
|
"learning_rate": 1.2384514448625337e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191098496.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2346
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.742652329749104,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.2330331347568634e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191159168.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2347
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.745519713261649,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.2276259245299957e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191224867.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2348
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633513702797483e-09,
|
|
"advantages/std": 0.33063647150993347,
|
|
"advantages/var": 0.10932047629253905,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.748387096774193,
|
|
"grad_norm": 0.07635001452023805,
|
|
"learning_rate": 1.22222982102801e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191300309.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2349
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.672251731040016e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.751254480286739,
|
|
"grad_norm": 0.11701705073979274,
|
|
"learning_rate": 1.2168448310829292e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 191385040.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.754121863799283,
|
|
"grad_norm": 0.06459934177246711,
|
|
"learning_rate": 1.211470961512705e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191474607.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2351
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.756989247311828,
|
|
"grad_norm": 0.052180544440312226,
|
|
"learning_rate": 1.2061082191212034e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191551333.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2352
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.379887186086637e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.759856630824372,
|
|
"grad_norm": 0.06945171785207692,
|
|
"learning_rate": 1.2007566106982049e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191635601.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2353
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.762724014336918,
|
|
"grad_norm": 0.10372258196470002,
|
|
"learning_rate": 1.1954161430193988e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 191703515.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2354
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.765591397849462,
|
|
"grad_norm": 0.08257684873620948,
|
|
"learning_rate": 1.1900868228463601e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191780782.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.768458781362007,
|
|
"grad_norm": 0.045251743367015144,
|
|
"learning_rate": 1.1847686569265591e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 191856502.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2356
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.7713261648745515,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.1794616519933342e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 191935191.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2357
|
|
},
|
|
{
|
|
"advantages/mean": -9.778887033462524e-09,
|
|
"advantages/snr": 1.707553557573222e-08,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.774193548387097,
|
|
"grad_norm": 0.11642091771443897,
|
|
"learning_rate": 1.1741658147659029e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 192017176.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.16097761690616608,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2358
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.7770609318996415,
|
|
"grad_norm": 0.05671825721944357,
|
|
"learning_rate": 1.1688811519493325e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192094521.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2359
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.15796030306511e-08,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.779928315412186,
|
|
"grad_norm": 0.12421389076108073,
|
|
"learning_rate": 1.1636076702345532e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 192180594.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.7827956989247316,
|
|
"grad_norm": 0.05446125050765403,
|
|
"learning_rate": 1.1583453762983286e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192264455.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2361
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.785663082437276,
|
|
"grad_norm": 0.06150132165919982,
|
|
"learning_rate": 1.1530942768032681e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192340392.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2362
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.8788021410185465e-09,
|
|
"advantages/std": 0.5726749300956726,
|
|
"advantages/var": 0.3279565755600835,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.788530465949821,
|
|
"grad_norm": 0.17767367881423626,
|
|
"learning_rate": 1.1478543783977945e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 192414442.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.14966705441474915,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2363
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907256955369e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.791397849462365,
|
|
"grad_norm": 0.10554860283841914,
|
|
"learning_rate": 1.1426256877161645e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192493471.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2364
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.794265232974911,
|
|
"grad_norm": 0.09473947993866931,
|
|
"learning_rate": 1.1374082113784288e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 192575267.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983628920516591e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 6.797132616487455,
|
|
"grad_norm": 0.10077049301848726,
|
|
"learning_rate": 1.1322019559904539e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192669884.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2366
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950002462672147e-08,
|
|
"advantages/std": 0.46760955452919006,
|
|
"advantages/var": 0.21865869548698758,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.8,
|
|
"grad_norm": 0.13009809400316338,
|
|
"learning_rate": 1.1270069281438866e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192758973.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.12020328640937805,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2367
|
|
},
|
|
{
|
|
"advantages/mean": 7.450580596923828e-09,
|
|
"advantages/snr": 1.3010090294948019e-08,
|
|
"advantages/std": 0.5726770758628845,
|
|
"advantages/var": 0.327959033218864,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.802867383512545,
|
|
"grad_norm": 0.28086267927181563,
|
|
"learning_rate": 1.1218231344161688e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192839763.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1530819833278656,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2368
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.225178756939558e-09,
|
|
"advantages/std": 0.6612661480903625,
|
|
"advantages/var": 0.4372729186102653,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.80573476702509,
|
|
"grad_norm": 0.15400359884936654,
|
|
"learning_rate": 1.1166505813705185e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192924907.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.19833700358867645,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2369
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917813257124117e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.808602150537634,
|
|
"grad_norm": 0.09790313334704302,
|
|
"learning_rate": 1.111489275555909e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193009806.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.811469534050179,
|
|
"grad_norm": 0.05408816665345678,
|
|
"learning_rate": 1.1063392235070878e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193080934.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2371
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.8143369175627235,
|
|
"grad_norm": 0.03213567979404306,
|
|
"learning_rate": 1.10120043174455e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193148839.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2372
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.817204301075269,
|
|
"grad_norm": 0.08309195880315318,
|
|
"learning_rate": 1.09607290677453e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 193225220.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2373
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917221686896894e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.8200716845878135,
|
|
"grad_norm": 0.15628542485796007,
|
|
"learning_rate": 1.0909566550890003e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193297917.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2374
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9752007807758586e-09,
|
|
"advantages/std": 0.4675939381122589,
|
|
"advantages/var": 0.21864409095933102,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.822939068100358,
|
|
"grad_norm": 0.09904807418569123,
|
|
"learning_rate": 1.0858516831656594e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193383409.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1065337061882019,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.505062750816392e-09,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.825806451612904,
|
|
"grad_norm": 0.20379846818685546,
|
|
"learning_rate": 1.0807579974679293e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193462091.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.15072786808013916,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2376
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.8787427301770344e-09,
|
|
"advantages/std": 0.5726819038391113,
|
|
"advantages/var": 0.32796456298478915,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.828673835125448,
|
|
"grad_norm": 0.1290161235042563,
|
|
"learning_rate": 1.0756756044449356e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193551892.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.15756267309188843,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2377
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.831541218637993,
|
|
"grad_norm": 0.11597649933266897,
|
|
"learning_rate": 1.0706045105315064e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193630545.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2378
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.834408602150537,
|
|
"grad_norm": 0.07912867494499928,
|
|
"learning_rate": 1.0655447221481684e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193699224.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2379
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907227504745508e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.837275985663083,
|
|
"grad_norm": 0.13735672066171117,
|
|
"learning_rate": 1.0604962457011346e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193780696.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.840143369175627,
|
|
"grad_norm": 0.06164208877522874,
|
|
"learning_rate": 1.0554590875822921e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193865866.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2381
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967257841033182e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.843010752688172,
|
|
"grad_norm": 0.11723371039089976,
|
|
"learning_rate": 1.0504332541691984e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 193933726.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2382
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.979341600636731e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.845878136200717,
|
|
"grad_norm": 0.13170995957867837,
|
|
"learning_rate": 1.0454187518250734e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 194013973.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2383
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.848745519713262,
|
|
"grad_norm": 0.09340825441817463,
|
|
"learning_rate": 1.040415586898794e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 194087202.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2384
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.851612903225806,
|
|
"grad_norm": 0.10969877961204985,
|
|
"learning_rate": 1.0354237657248788e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 194168985.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.971194650918909e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.854480286738351,
|
|
"grad_norm": 0.12611193260627374,
|
|
"learning_rate": 1.0304432946234831e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 194247966.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2386
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.857347670250896,
|
|
"grad_norm": 0.06003868787153907,
|
|
"learning_rate": 1.0254741799003975e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 194322091.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2387
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299807237755752e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.860215053763441,
|
|
"grad_norm": 0.07350127118769947,
|
|
"learning_rate": 1.0205164278470258e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 194400704.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2388
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.8630824372759855,
|
|
"grad_norm": 0.07557750424369361,
|
|
"learning_rate": 1.0155700447403958e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 194478730.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2389
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.86594982078853,
|
|
"grad_norm": 0.08613156187088937,
|
|
"learning_rate": 1.0106350368431304e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 194566159.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.868817204301076,
|
|
"grad_norm": 0.04729672915690954,
|
|
"learning_rate": 1.0057114104034604e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 194641042.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2391
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.87168458781362,
|
|
"grad_norm": 0.03435381276349522,
|
|
"learning_rate": 1.0007991716551967e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 194717110.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2392
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983629174425397e-09,
|
|
"advantages/std": 0.46757492423057556,
|
|
"advantages/var": 0.21862630976922848,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.874551971326165,
|
|
"grad_norm": 0.1423937645833672,
|
|
"learning_rate": 9.958983268177423e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 194799789.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2393
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 6.877419354838709,
|
|
"grad_norm": 0.05553013396548312,
|
|
"learning_rate": 9.91008882096065e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 194881420.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2394
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.880286738351255,
|
|
"grad_norm": 0.0711328390587382,
|
|
"learning_rate": 9.861308436807058e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 194966089.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.883154121863799,
|
|
"grad_norm": 0.07308119092273002,
|
|
"learning_rate": 9.812642177477582e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 195042405.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2396
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.0224013087023915e-09,
|
|
"advantages/std": 0.6185722351074219,
|
|
"advantages/var": 0.3826316100457916,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.886021505376344,
|
|
"grad_norm": 0.14216845633733094,
|
|
"learning_rate": 9.76409010458874e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195124172.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.190970316529274,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2397
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.2998806953079044e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.888888888888889,
|
|
"grad_norm": 0.13111727040757493,
|
|
"learning_rate": 9.715652279612385e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195194271.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2398
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.891756272401434,
|
|
"grad_norm": 0.06978758286791242,
|
|
"learning_rate": 9.667328763875815e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195276428.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2399
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.894623655913978,
|
|
"grad_norm": 0.01962447601162728,
|
|
"learning_rate": 9.619119618561511e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195349877.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.897491039426523,
|
|
"grad_norm": 0.09527564749609127,
|
|
"learning_rate": 9.571024904707237e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195426482.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2401
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983334154224049e-09,
|
|
"advantages/std": 0.46760955452919006,
|
|
"advantages/var": 0.21865869548698758,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.900358422939068,
|
|
"grad_norm": 0.0949697787491498,
|
|
"learning_rate": 9.523044683205816e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 195508115.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.12020329385995865,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2402
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.903225806451613,
|
|
"grad_norm": 0.07750667496085199,
|
|
"learning_rate": 9.47517901480509e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195581496.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2403
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.9060931899641576,
|
|
"grad_norm": 0.03105968279643574,
|
|
"learning_rate": 9.427427960107948e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195657265.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2404
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.908960573476703,
|
|
"grad_norm": 0.12653924761950092,
|
|
"learning_rate": 9.379791579572116e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195744986.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.12415501475334167,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.911827956989248,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 9.332269933510118e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195810789.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2406
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.7495735248314805e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.914695340501792,
|
|
"grad_norm": 0.10262501963713686,
|
|
"learning_rate": 9.284863082089222e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 195888792.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2407
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.96238646402201e-09,
|
|
"advantages/std": 0.4676155745983124,
|
|
"advantages/var": 0.21866432560690985,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.917562724014337,
|
|
"grad_norm": 0.09030280936181079,
|
|
"learning_rate": 9.237571085331375e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 195978064.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.12809401750564575,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2408
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.920430107526881,
|
|
"grad_norm": 0.126383410799034,
|
|
"learning_rate": 9.190394003113122e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196061832.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1462521106004715,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2409
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516766163250374e-09,
|
|
"advantages/std": 0.6185770034790039,
|
|
"advantages/var": 0.3826375092330636,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 6.923297491039427,
|
|
"grad_norm": 0.16998489866338543,
|
|
"learning_rate": 9.143331895165451e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196146156.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.19568344950675964,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.926164874551971,
|
|
"grad_norm": 0.07979723427059401,
|
|
"learning_rate": 9.0963848210738e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 196227546.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2411
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.929032258064516,
|
|
"grad_norm": 0.01980690782337147,
|
|
"learning_rate": 9.049552840278008e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196306204.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2412
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344599133202926e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.931899641577061,
|
|
"grad_norm": 0.16532044244601588,
|
|
"learning_rate": 9.002836012072168e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196391558.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2413
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998470125758874e-09,
|
|
"advantages/std": 0.40494978427886963,
|
|
"advantages/var": 0.16398432778750305,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.934767025089606,
|
|
"grad_norm": 0.1256327719558024,
|
|
"learning_rate": 8.956234395604556e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196468912.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2414
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599693686636838e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.93763440860215,
|
|
"grad_norm": 0.09539153420367744,
|
|
"learning_rate": 8.909748049877586e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 196554755.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.940501792114695,
|
|
"grad_norm": 0.06816916822524462,
|
|
"learning_rate": 8.863377033747754e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196619149.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2416
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.94336917562724,
|
|
"grad_norm": 0.11626064535405715,
|
|
"learning_rate": 8.817121405925543e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196697082.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2417
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899421713267256e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.946236559139785,
|
|
"grad_norm": 0.1246901997257576,
|
|
"learning_rate": 8.770981224975283e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196762401.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2418
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33063647150993347,
|
|
"advantages/var": 0.10932047629253905,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.94910394265233,
|
|
"grad_norm": 0.06247756457977004,
|
|
"learning_rate": 8.724956549315177e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 196837633.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2419
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.951971326164875,
|
|
"grad_norm": 0.041928667114635366,
|
|
"learning_rate": 8.679047437217202e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196905063.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.504861672153852e-09,
|
|
"advantages/std": 0.5726932287216187,
|
|
"advantages/var": 0.3279775342235922,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 6.95483870967742,
|
|
"grad_norm": 0.1198628919772755,
|
|
"learning_rate": 8.633253946806974e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 196990922.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.172288179397583,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2421
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998806953079044e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.957706093189964,
|
|
"grad_norm": 0.06730846439692623,
|
|
"learning_rate": 8.587576136063767e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 197078931.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2422
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.126037115417672e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.960573476702509,
|
|
"grad_norm": 0.11127546833146218,
|
|
"learning_rate": 8.542014062820369e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 197170175.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2423
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.963440860215054,
|
|
"grad_norm": 0.08439475203157298,
|
|
"learning_rate": 8.496567784763032e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 197235679.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2424
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.966308243727599,
|
|
"grad_norm": 0.08537979412922808,
|
|
"learning_rate": 8.451237359431396e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 197302221.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 6.969175627240143,
|
|
"grad_norm": 0.06154497728748271,
|
|
"learning_rate": 8.406022844218452e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 197385198.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2426
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 6.972043010752688,
|
|
"grad_norm": 0.06944545136096203,
|
|
"learning_rate": 8.360924296370375e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 197461982.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2427
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 6.974910394265233,
|
|
"grad_norm": 0.07255534103341262,
|
|
"learning_rate": 8.31594177298659e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 197548845.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2428
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.7495735248314805e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.977777777777778,
|
|
"grad_norm": 0.10179773967799725,
|
|
"learning_rate": 8.271075331019539e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 197630688.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2429
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.0689055673126508e-08,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.980645161290322,
|
|
"grad_norm": 0.18752062690561613,
|
|
"learning_rate": 8.226325027274783e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 197709725.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814855139419146e-09,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 6.983512544802867,
|
|
"grad_norm": 0.1547020490210697,
|
|
"learning_rate": 8.181690918410755e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 197789203.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2431
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 6.986379928315412,
|
|
"grad_norm": 0.10691789449904196,
|
|
"learning_rate": 8.13717306093884e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 197878737.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 2432
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 1.0349153895649778e-08,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 6.989247311827957,
|
|
"grad_norm": 0.14593278899579626,
|
|
"learning_rate": 8.092771511223185e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 197957783.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2433
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 6.992114695340502,
|
|
"grad_norm": 0.09489214535621193,
|
|
"learning_rate": 8.04848632548073e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 198036099.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2434
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 6.994982078853047,
|
|
"grad_norm": 0.05444184447137259,
|
|
"learning_rate": 8.004317559781048e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 198126276.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 6.997849462365592,
|
|
"grad_norm": 0.10842610808272253,
|
|
"learning_rate": 7.960265270046306e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 198188792.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2436
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.0539042852721367e-08,
|
|
"advantages/std": 0.6185815930366516,
|
|
"advantages/var": 0.3826431872437617,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 7.002867383512545,
|
|
"grad_norm": 0.15088482111626852,
|
|
"learning_rate": 7.916329512051234e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 198274154.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.20357416570186615,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 2437
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 7.00573476702509,
|
|
"grad_norm": 0.11637849863941437,
|
|
"learning_rate": 7.872510341423021e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 198359529.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2438
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.008602150537635,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.828807813641225e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 198436506.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2439
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.562997839424082e-09,
|
|
"advantages/std": 0.5227746963500977,
|
|
"advantages/var": 0.2732933831439368,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.011469534050179,
|
|
"grad_norm": 0.16305711932052788,
|
|
"learning_rate": 7.785221984037694e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 198507930.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.12179599702358246,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7814455009491016e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.014336917562724,
|
|
"grad_norm": 0.11316457630747334,
|
|
"learning_rate": 7.741752907796583e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 198585558.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2441
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.017204301075269,
|
|
"grad_norm": 0.05597624489885627,
|
|
"learning_rate": 7.698400639954216e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 198664801.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2442
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.020071684587814,
|
|
"grad_norm": 0.07938407607405072,
|
|
"learning_rate": 7.655165235398986e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 198737953.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2443
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5628597236829876e-09,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.022939068100358,
|
|
"grad_norm": 0.16381168187844106,
|
|
"learning_rate": 7.612046748871326e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 198822217.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2444
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.025806451612903,
|
|
"grad_norm": 0.08935558920486494,
|
|
"learning_rate": 7.56904523496369e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 198897359.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.028673835125448,
|
|
"grad_norm": 0.091064230207462,
|
|
"learning_rate": 7.526160748120414e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 198971639.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2446
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 1.1266523706756892e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.031541218637993,
|
|
"grad_norm": 0.08820505386920832,
|
|
"learning_rate": 7.483393342637634e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 199044117.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2447
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.034408602150537,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 7.440743072663258e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199114910.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2448
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.344329800322181e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.037275985663083,
|
|
"grad_norm": 0.1262476234721405,
|
|
"learning_rate": 7.398209992196913e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 199205873.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2449
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.6262492693233955e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.040143369175627,
|
|
"grad_norm": 0.14993712851375646,
|
|
"learning_rate": 7.355794155089856e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199296915.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.301018914496957e-08,
|
|
"advantages/std": 0.5726727247238159,
|
|
"advantages/var": 0.32795404964259944,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.043010752688172,
|
|
"grad_norm": 0.1454220498452137,
|
|
"learning_rate": 7.313495615044873e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199378036.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.1462520956993103,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2451
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.4083154633446115e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.045878136200717,
|
|
"grad_norm": 0.06542823423939587,
|
|
"learning_rate": 7.271314425616226e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199442570.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2452
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917003347966285e-09,
|
|
"advantages/std": 0.4676017463207245,
|
|
"advantages/var": 0.21865139316219118,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.048745519713262,
|
|
"grad_norm": 0.10509837429756382,
|
|
"learning_rate": 7.22925064020966e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199522282.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.11336849629878998,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2453
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.051612903225807,
|
|
"grad_norm": 0.08909232513147333,
|
|
"learning_rate": 7.187304312082243e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 199610659.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2454
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966992261291921e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.054480286738351,
|
|
"grad_norm": 0.11894177574843282,
|
|
"learning_rate": 7.145475494342301e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 199688065.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.126132177603901e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.057347670250896,
|
|
"grad_norm": 0.12626331846484162,
|
|
"learning_rate": 7.103764239949405e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199757273.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2456
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954489382432772e-08,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.060215053763441,
|
|
"grad_norm": 0.14694301392642373,
|
|
"learning_rate": 7.062170601714301e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199838388.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2457
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.063082437275986,
|
|
"grad_norm": 0.1048089583277469,
|
|
"learning_rate": 7.020694632298784e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199912824.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2458
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.06594982078853,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.979336384215695e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 199985682.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2459
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 7.068817204301075,
|
|
"grad_norm": 0.1195769342952238,
|
|
"learning_rate": 6.938095909828789e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200073777.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958906628562059e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.07168458781362,
|
|
"grad_norm": 0.06898870735602258,
|
|
"learning_rate": 6.896973261352778e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200152062.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2461
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.074551971326165,
|
|
"grad_norm": 0.03610014767434947,
|
|
"learning_rate": 6.855968490853104e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200226868.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2462
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876548503938182e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.077419354838709,
|
|
"grad_norm": 0.12298941786189987,
|
|
"learning_rate": 6.815081650246047e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 200308022.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2463
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.3278497974719924e-09,
|
|
"advantages/std": 0.7013764381408691,
|
|
"advantages/var": 0.49192890797917244,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.080286738351255,
|
|
"grad_norm": 0.21915388336290187,
|
|
"learning_rate": 6.774312791298509e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200394472.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.21937325596809387,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2464
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.0831541218637994,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.73366196562808e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200460705.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.971038697911764e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.086021505376344,
|
|
"grad_norm": 0.09577246218740154,
|
|
"learning_rate": 6.693129224702831e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200544319.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2466
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633261853378446e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.088888888888889,
|
|
"grad_norm": 0.09586888057076874,
|
|
"learning_rate": 6.652714619841404e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200619383.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2467
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 7.091756272401434,
|
|
"grad_norm": 0.04584529705030624,
|
|
"learning_rate": 6.6124182022128e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 200695502.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2468
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33065706491470337,
|
|
"advantages/var": 0.10933409457800636,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.094623655913979,
|
|
"grad_norm": 0.07981054314409838,
|
|
"learning_rate": 6.572240022836451e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 200764514.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.06681530922651291,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2469
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.097491039426523,
|
|
"grad_norm": 0.06356528690336953,
|
|
"learning_rate": 6.53218013258201e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200829199.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.100358422939068,
|
|
"grad_norm": 0.05365157869329496,
|
|
"learning_rate": 6.492238582169451e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 200917552.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2471
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876051507987635e-09,
|
|
"advantages/std": 0.4675931930541992,
|
|
"advantages/var": 0.21864339419062162,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.103225806451613,
|
|
"grad_norm": 0.14637648913578724,
|
|
"learning_rate": 6.452415422168845e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 201004050.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2472
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.23380985856056213,
|
|
"advantages/var": 0.05466704996011007,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.106093189964158,
|
|
"grad_norm": 0.04838502213851617,
|
|
"learning_rate": 6.412710703000367e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 201073736.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.033407654613256454,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2473
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.108960573476702,
|
|
"grad_norm": 0.036619862059781794,
|
|
"learning_rate": 6.37312447493431e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 201151569.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2474
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966992261291921e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.111827956989248,
|
|
"grad_norm": 0.11462852942974504,
|
|
"learning_rate": 6.33365678809088e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 201222822.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.114695340501792,
|
|
"grad_norm": 0.08145859769459904,
|
|
"learning_rate": 6.294307692440215e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 201304069.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2476
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.7248074468154485e-08,
|
|
"advantages/std": 0.4049680531024933,
|
|
"advantages/var": 0.16399912403362382,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.117562724014337,
|
|
"grad_norm": 0.07785665518342678,
|
|
"learning_rate": 6.255077237802286e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 201381240.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.09704046696424484,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2477
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.1266652673742488e-08,
|
|
"advantages/std": 0.33064746856689453,
|
|
"advantages/var": 0.10932774846969551,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.120430107526881,
|
|
"grad_norm": 0.07746058409982864,
|
|
"learning_rate": 6.215965473846896e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 201459479.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2478
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.123297491039427,
|
|
"grad_norm": 0.07708215383076802,
|
|
"learning_rate": 6.176972450093543e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 201540054.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2479
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.1261648745519715,
|
|
"grad_norm": 0.059661576972812075,
|
|
"learning_rate": 6.138098215911391e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 201628433.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.129032258064516,
|
|
"grad_norm": 0.056466482049052524,
|
|
"learning_rate": 6.099342820519183e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 201706500.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2481
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.4393170339684606e-09,
|
|
"advantages/std": 0.5726946592330933,
|
|
"advantages/var": 0.3279791727141088,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.131899641577061,
|
|
"grad_norm": 0.11187153366748942,
|
|
"learning_rate": 6.060706312985253e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 201798919.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.17123225331306458,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2482
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 7.041969672843697e-10,
|
|
"advantages/std": 0.6612656712532043,
|
|
"advantages/var": 0.4372722879779509,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.134767025089606,
|
|
"grad_norm": 0.17394203671347708,
|
|
"learning_rate": 6.022188742227374e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 201878505.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.19727617502212524,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2483
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.137634408602151,
|
|
"grad_norm": 0.06380423232231565,
|
|
"learning_rate": 5.983790157012736e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 201960377.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2484
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.816724861393605e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.140501792114695,
|
|
"grad_norm": 0.042265642867532593,
|
|
"learning_rate": 5.9455106059578596e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 202035597.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.7815011540266774e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.14336917562724,
|
|
"grad_norm": 0.11748549556727116,
|
|
"learning_rate": 5.907350137528622e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202109527.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 2486
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.991685990697206e-09,
|
|
"advantages/std": 0.4676051139831543,
|
|
"advantages/var": 0.21865454262319872,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.146236559139785,
|
|
"grad_norm": 0.10056455570577012,
|
|
"learning_rate": 5.8693088000400736e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202187940.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.11784426867961884,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2487
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 7.14910394265233,
|
|
"grad_norm": 0.07523463956060042,
|
|
"learning_rate": 5.8313866416564436e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202273211.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2488
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983562397524497e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.151971326164874,
|
|
"grad_norm": 0.13710367582215893,
|
|
"learning_rate": 5.793583710391059e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202354516.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2489
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199023822625729e-09,
|
|
"advantages/std": 0.4049658179283142,
|
|
"advantages/var": 0.16399731369034853,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.15483870967742,
|
|
"grad_norm": 0.18112412261793998,
|
|
"learning_rate": 5.755900054106333e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202434011.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.094686359167099,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.157706093189964,
|
|
"grad_norm": 0.08031875099903989,
|
|
"learning_rate": 5.718335720513601e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202498509.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 2491
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.160573476702509,
|
|
"grad_norm": 0.057577147177145155,
|
|
"learning_rate": 5.680890757173207e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 202581140.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2492
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 7.163440860215053,
|
|
"grad_norm": 0.08395981953565397,
|
|
"learning_rate": 5.643565211494283e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 202663619.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2493
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599761390615809e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.166308243727599,
|
|
"grad_norm": 0.10697476091480708,
|
|
"learning_rate": 5.606359130734806e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202743150.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2494
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.1691756272401435,
|
|
"grad_norm": 0.05948306440981583,
|
|
"learning_rate": 5.56927256200147e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 202822842.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 7.172043010752688,
|
|
"grad_norm": 0.09644223746896867,
|
|
"learning_rate": 5.532305552249705e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202904984.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2496
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.4082993428404723e-08,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.174910394265233,
|
|
"grad_norm": 0.06960546121549602,
|
|
"learning_rate": 5.495458148283505e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 202983316.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2497
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983361318629381e-09,
|
|
"advantages/std": 0.4676063656806946,
|
|
"advantages/var": 0.21865571322510746,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.177777777777778,
|
|
"grad_norm": 0.09059362245450198,
|
|
"learning_rate": 5.4587303967554954e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 203064975.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.11913755536079407,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2498
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.180645161290323,
|
|
"grad_norm": 0.07122944248895587,
|
|
"learning_rate": 5.422122344166735e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 203154506.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2499
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.183512544802867,
|
|
"grad_norm": 0.09372971274196308,
|
|
"learning_rate": 5.385634036866793e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 203233907.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.186379928315413,
|
|
"grad_norm": 0.10943414932965682,
|
|
"learning_rate": 5.349265521053603e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 203308143.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2501
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.189247311827957,
|
|
"grad_norm": 0.14114137293314274,
|
|
"learning_rate": 5.3130168427734434e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 203373971.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2502
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 8.962749759103603e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.192114695340502,
|
|
"grad_norm": 0.15433333711897534,
|
|
"learning_rate": 5.2768880479208356e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 203461764.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2503
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.194982078853046,
|
|
"grad_norm": 0.09641316610306644,
|
|
"learning_rate": 5.2408791822385664e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 203528589.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2504
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227949619293213,
|
|
"advantages/var": 0.2733145722186805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.197849462365592,
|
|
"grad_norm": 0.12211529528959136,
|
|
"learning_rate": 5.204990291317535e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 203608123.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.14123955368995667,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.200716845878136,
|
|
"grad_norm": 0.09335209234175323,
|
|
"learning_rate": 5.1692214205967476e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 203674869.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2506
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.203584229390681,
|
|
"grad_norm": 0.11655707639584922,
|
|
"learning_rate": 5.133572615363269e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 203753097.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2507
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907227504745508e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.2064516129032254,
|
|
"grad_norm": 0.12189736363683759,
|
|
"learning_rate": 5.0980439207521485e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 203832120.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2508
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.209318996415771,
|
|
"grad_norm": 0.054068834445732794,
|
|
"learning_rate": 5.0626353817463606e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 203915326.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2509
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.2121863799283155,
|
|
"grad_norm": 0.09532110290015222,
|
|
"learning_rate": 5.027347043176722e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 203983549.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08785156905651093,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599751573415311e-09,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.21505376344086,
|
|
"grad_norm": 0.06153929717244519,
|
|
"learning_rate": 4.9921789497218925e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 204062587.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2511
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.217921146953405,
|
|
"grad_norm": 0.06708426684173453,
|
|
"learning_rate": 4.957131145908311e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 204135455.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2512
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.22078853046595,
|
|
"grad_norm": 0.049674688426611074,
|
|
"learning_rate": 4.9222036761100595e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 204205206.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2513
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.0349462367204652e-08,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.223655913978495,
|
|
"grad_norm": 0.06878477145620411,
|
|
"learning_rate": 4.887396584548909e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 204290056.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2514
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.3798774372216438e-08,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.226523297491039,
|
|
"grad_norm": 0.11566865259735848,
|
|
"learning_rate": 4.852709915294195e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 204355728.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.229390681003585,
|
|
"grad_norm": 0.08328792189926526,
|
|
"learning_rate": 4.818143712262812e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 204438845.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2516
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966992261291921e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.232258064516129,
|
|
"grad_norm": 0.07904085007979987,
|
|
"learning_rate": 4.783698019219118e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 204523569.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2517
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.235125448028674,
|
|
"grad_norm": 0.07040339549726506,
|
|
"learning_rate": 4.7493728797748713e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 204608401.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2518
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.237992831541218,
|
|
"grad_norm": 0.12930695883543628,
|
|
"learning_rate": 4.7151683373892306e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 204693577.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2519
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.757231577566731e-09,
|
|
"advantages/std": 0.5726968050003052,
|
|
"advantages/var": 0.32798163045755757,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.240860215053764,
|
|
"grad_norm": 0.14847926465783368,
|
|
"learning_rate": 4.681084435368665e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 204779642.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.17464719712734222,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.243727598566308,
|
|
"grad_norm": 0.09385543760658205,
|
|
"learning_rate": 4.647121216866856e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 204848826.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2521
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016648251989223e-09,
|
|
"advantages/std": 0.5227810144424438,
|
|
"advantages/var": 0.2732999890614707,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.246594982078853,
|
|
"grad_norm": 0.12028791578704512,
|
|
"learning_rate": 4.61327872488475e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 204930284.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.12756995856761932,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2522
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.2494623655913975,
|
|
"grad_norm": 0.0775232523613318,
|
|
"learning_rate": 4.5795570022703954e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205015997.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2523
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.252329749103943,
|
|
"grad_norm": 0.09731319644944193,
|
|
"learning_rate": 4.54595609171895e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205089865.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2524
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.2551971326164875,
|
|
"grad_norm": 0.09186376937235984,
|
|
"learning_rate": 4.512476035772628e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205163985.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6721527138226332e-09,
|
|
"advantages/std": 0.5227934122085571,
|
|
"advantages/var": 0.27331295184866633,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.258064516129032,
|
|
"grad_norm": 0.14501961184136447,
|
|
"learning_rate": 4.479116876820588e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 205249213.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.1422954797744751,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2526
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.260931899641577,
|
|
"grad_norm": 0.10031695362138324,
|
|
"learning_rate": 4.445878657098978e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 205328564.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2527
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383701047106082e-08,
|
|
"advantages/std": 0.5726835131645203,
|
|
"advantages/var": 0.32796640625045725,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.263799283154122,
|
|
"grad_norm": 0.15816144497770732,
|
|
"learning_rate": 4.412761418690747e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205421127.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.15991678833961487,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2528
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.266666666666667,
|
|
"grad_norm": 0.06483258155619981,
|
|
"learning_rate": 4.3797652035257536e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205505505.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2529
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.269534050179211,
|
|
"grad_norm": 0.11694285020397602,
|
|
"learning_rate": 4.3468900533805694e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205575428.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.272401433691757,
|
|
"grad_norm": 0.017510490695550458,
|
|
"learning_rate": 4.314136009878511e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 205647145.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2531
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.449710856633628e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.275268817204301,
|
|
"grad_norm": 0.07153917715055116,
|
|
"learning_rate": 4.2815031144895484e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205718945.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2532
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950488391937882e-08,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.278136200716846,
|
|
"grad_norm": 0.17714243005397393,
|
|
"learning_rate": 4.248991408530278e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205788402.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2533
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.28100358422939,
|
|
"grad_norm": 0.10279823826708101,
|
|
"learning_rate": 4.2166009331638494e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205858936.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2534
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.3942077395823529e-08,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.283870967741936,
|
|
"grad_norm": 0.09854792207558746,
|
|
"learning_rate": 4.184331729399937e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 205939534.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.28673835125448,
|
|
"grad_norm": 0.0528413826752298,
|
|
"learning_rate": 4.152183838094636e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 206021897.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2536
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.1579239975137157e-08,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.289605734767025,
|
|
"grad_norm": 0.12729764081841807,
|
|
"learning_rate": 4.1201572999504995e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 206104077.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2537
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.449667444137735e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.29247311827957,
|
|
"grad_norm": 0.12420338671443103,
|
|
"learning_rate": 4.088252155516403e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 206178062.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2538
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814513910737996e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.295340501792115,
|
|
"grad_norm": 0.15547905086589858,
|
|
"learning_rate": 4.0564684451874996e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 206269177.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2539
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.2982078853046595,
|
|
"grad_norm": 0.09306918284991635,
|
|
"learning_rate": 4.024806209205256e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 206344200.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.301075268817204,
|
|
"grad_norm": 0.0476275501816316,
|
|
"learning_rate": 3.9932654876573155e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 206409750.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 2541
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.572684645652771,
|
|
"advantages/var": 0.3279677033664399,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.3039426523297495,
|
|
"grad_norm": 0.11299043851954678,
|
|
"learning_rate": 3.9618463204774467e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 206492881.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.16203844547271729,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2542
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749358156051495e-09,
|
|
"advantages/std": 0.4049680531024933,
|
|
"advantages/var": 0.16399912403362382,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.306810035842294,
|
|
"grad_norm": 0.09498043700801338,
|
|
"learning_rate": 3.930548747445528e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 206566138.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09704046696424484,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2543
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.309677419354839,
|
|
"grad_norm": 0.06873867862464882,
|
|
"learning_rate": 3.899372808187506e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 206630974.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2544
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 7.312544802867383,
|
|
"grad_norm": 0.0839417986922333,
|
|
"learning_rate": 3.868318542175331e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 206717574.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.315412186379929,
|
|
"grad_norm": 0.10666014521366664,
|
|
"learning_rate": 3.8373859887268714e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 206799899.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2546
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.318279569892473,
|
|
"grad_norm": 0.13026163817783157,
|
|
"learning_rate": 3.8065751870059003e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 206887299.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 2547
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.9752099207640785e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.321146953405018,
|
|
"grad_norm": 0.1197530958272612,
|
|
"learning_rate": 3.775886176022069e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 206963535.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2548
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983438498026945e-09,
|
|
"advantages/std": 0.4675973057746887,
|
|
"advantages/var": 0.21864724036774774,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.324014336917562,
|
|
"grad_norm": 0.0837039966185904,
|
|
"learning_rate": 3.7453189946308195e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207052943.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.11100947856903076,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2549
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.6917598869923385e-09,
|
|
"advantages/std": 0.5726926326751709,
|
|
"advantages/var": 0.3279768515204182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.326881720430108,
|
|
"grad_norm": 0.11095626885101538,
|
|
"learning_rate": 3.714873681533315e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207147219.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.1712273508310318,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2524623174213196e-09,
|
|
"advantages/std": 0.5726876854896545,
|
|
"advantages/var": 0.3279711851114975,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.329749103942652,
|
|
"grad_norm": 0.15933745156749995,
|
|
"learning_rate": 3.6845502752764544e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207226913.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.1633366346359253,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2551
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675973057746887,
|
|
"advantages/var": 0.21864724036774774,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.332616487455197,
|
|
"grad_norm": 0.08685867984371011,
|
|
"learning_rate": 3.6543488142527615e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207307112.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.11100948601961136,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2552
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.335483870967742,
|
|
"grad_norm": 0.051231125537430956,
|
|
"learning_rate": 3.624269336700436e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207400641.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 2553
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016396905864186e-09,
|
|
"advantages/std": 0.5227974057197571,
|
|
"advantages/var": 0.2733171274273083,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.338351254480287,
|
|
"grad_norm": 0.10042135568757057,
|
|
"learning_rate": 3.5943118807031046e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 207480069.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.14465448260307312,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2554
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.3412186379928315,
|
|
"grad_norm": 0.15793484797984386,
|
|
"learning_rate": 3.5644764841900156e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207557204.0,
|
|
"reward": 0.71875,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.71875,
|
|
"rewards/drgrpo_math_reward/std": 0.4513758420944214,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.344086021505376,
|
|
"grad_norm": 0.08624708986100145,
|
|
"learning_rate": 3.5347631849358514e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 207641548.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2556
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.3469534050179215,
|
|
"grad_norm": 0.06056629963832414,
|
|
"learning_rate": 3.505172020560687e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207717287.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2557
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 5.691702426092771e-09,
|
|
"advantages/std": 0.5726984143257141,
|
|
"advantages/var": 0.3279834737711873,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.349820788530466,
|
|
"grad_norm": 0.13642803320793265,
|
|
"learning_rate": 3.4757030285299524e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207799916.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.17700131237506866,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2558
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.352688172043011,
|
|
"grad_norm": 0.04221746753613528,
|
|
"learning_rate": 3.4463562461544246e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 207884254.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2559
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.355555555555555,
|
|
"grad_norm": 0.24809742864211887,
|
|
"learning_rate": 3.4171317105901486e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 207965121.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199317639730369e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.358422939068101,
|
|
"grad_norm": 0.12351034662831538,
|
|
"learning_rate": 3.388029458838359e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208038743.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2561
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.361290322580645,
|
|
"grad_norm": 0.06414865538451837,
|
|
"learning_rate": 3.3590495277455165e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 208120922.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2562
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.36415770609319,
|
|
"grad_norm": 0.06516190830643677,
|
|
"learning_rate": 3.3301919540031586e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208192020.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2563
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875832530345343e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.367025089605734,
|
|
"grad_norm": 0.0795032768026142,
|
|
"learning_rate": 3.301456774147959e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208277136.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2564
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958906628562059e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.36989247311828,
|
|
"grad_norm": 0.11348462552313462,
|
|
"learning_rate": 3.272844024561572e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208345186.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.505166341645741e-09,
|
|
"advantages/std": 0.5726664066314697,
|
|
"advantages/var": 0.32794681328419983,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.372759856630824,
|
|
"grad_norm": 0.1530698628692608,
|
|
"learning_rate": 3.244353741470707e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208419884.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.13941732048988342,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2566
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.375627240143369,
|
|
"grad_norm": 0.18265269781254576,
|
|
"learning_rate": 3.215985960946943e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208496416.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2567
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.967257841033182e-09,
|
|
"advantages/std": 0.46757495403289795,
|
|
"advantages/var": 0.21862633763886663,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.378494623655914,
|
|
"grad_norm": 0.10570541480773055,
|
|
"learning_rate": 3.18774071890684e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208578537.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2568
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.381362007168459,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 3.1596180511117235e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208648020.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 1.0,
|
|
"rewards/drgrpo_math_reward/std": 0.0,
|
|
"step": 2569
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 4.0655470077588484e-09,
|
|
"advantages/std": 0.5726920366287231,
|
|
"advantages/var": 0.32797616881795477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.3842293906810035,
|
|
"grad_norm": 0.1392555252981525,
|
|
"learning_rate": 3.1316179931678235e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208736732.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.1701665222644806,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 7.041475135887903e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.387096774193548,
|
|
"grad_norm": 0.0684602925732368,
|
|
"learning_rate": 3.10374058052606e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 208817380.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2571
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.0112061672975522e-09,
|
|
"advantages/std": 0.6185711026191711,
|
|
"advantages/var": 0.38263020899549716,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.3899641577060935,
|
|
"grad_norm": 0.21612746472662595,
|
|
"learning_rate": 3.075985848482077e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 208895255.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.18884865939617157,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2572
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.392831541218638,
|
|
"grad_norm": 0.10069243812865293,
|
|
"learning_rate": 3.04835383217622e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 208982394.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2573
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.395698924731183,
|
|
"grad_norm": 0.07040539274525603,
|
|
"learning_rate": 3.0208445665934836e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209064964.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2574
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.2946288161655503e-08,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.398566308243727,
|
|
"grad_norm": 0.10356970342878828,
|
|
"learning_rate": 2.993458086563405e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209141967.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.1054728701710701,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.401433691756273,
|
|
"grad_norm": 0.05903156545806232,
|
|
"learning_rate": 2.9661944267600492e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 209225654.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2576
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.404301075268817,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 2.9390536217020147e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209296094.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 1.0,
|
|
"rewards/drgrpo_math_reward/std": 0.0,
|
|
"step": 2577
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.407168458781362,
|
|
"grad_norm": 0.03914098693528715,
|
|
"learning_rate": 2.912035705752369e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209370141.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2578
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 7.410035842293907,
|
|
"grad_norm": 0.09784656376581713,
|
|
"learning_rate": 2.88514071311855e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209452454.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2579
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344527836563254e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 7.412903225806452,
|
|
"grad_norm": 0.13681151495552854,
|
|
"learning_rate": 2.858368677852352e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209547865.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.1173202320933342,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"advantages/mean": -6.05359673500061e-09,
|
|
"advantages/snr": 1.494855892459685e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.415770609318996,
|
|
"grad_norm": 0.09461766747123089,
|
|
"learning_rate": 2.8317196338499493e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209626804.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2581
|
|
},
|
|
{
|
|
"advantages/mean": -7.450580596923828e-09,
|
|
"advantages/snr": 1.2044967427438685e-08,
|
|
"advantages/std": 0.6185637712478638,
|
|
"advantages/var": 0.38262113910037954,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.418637992831541,
|
|
"grad_norm": 0.12677449799390703,
|
|
"learning_rate": 2.805193614851742e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209712096.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.17965976893901825,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2582
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950488391937882e-08,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.421505376344086,
|
|
"grad_norm": 0.10649650202454893,
|
|
"learning_rate": 2.7787906544424088e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209793105.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2583
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299846843318419e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.424372759856631,
|
|
"grad_norm": 0.12365945314753753,
|
|
"learning_rate": 2.7525107860507767e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 209868130.0,
|
|
"reward": 0.7421875,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.7421875,
|
|
"rewards/drgrpo_math_reward/std": 0.43914902210235596,
|
|
"step": 2584
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 7.966544250856589e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.4272401433691755,
|
|
"grad_norm": 0.04767377163044936,
|
|
"learning_rate": 2.7263540429498744e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 209949629.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983496130645961e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.43010752688172,
|
|
"grad_norm": 0.1227084083377906,
|
|
"learning_rate": 2.700320458256833e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210033480.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.10205792635679245,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2586
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149903618877876e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.4329749103942655,
|
|
"grad_norm": 0.09206992125443138,
|
|
"learning_rate": 2.6744100649327973e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210112615.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2587
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.43584229390681,
|
|
"grad_norm": 0.06789904170480178,
|
|
"learning_rate": 2.6486228957830147e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210185126.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2588
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907665222004876e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.438709677419355,
|
|
"grad_norm": 0.09063198803444496,
|
|
"learning_rate": 2.62295898345668e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210262699.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2589
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344329800322181e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.4415770609319,
|
|
"grad_norm": 0.18881555002510433,
|
|
"learning_rate": 2.5974183604469347e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210342850.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917480653229804e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.444444444444445,
|
|
"grad_norm": 0.14514947640500095,
|
|
"learning_rate": 2.5720010590908115e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 210419831.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2591
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.447311827956989,
|
|
"grad_norm": 0.08303006583482903,
|
|
"learning_rate": 2.546707111569235e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210495603.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2592
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016504754270957e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.450179211469534,
|
|
"grad_norm": 0.11073366251997893,
|
|
"learning_rate": 2.5215365499069442e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210581943.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2593
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.453046594982079,
|
|
"grad_norm": 0.07828569838957963,
|
|
"learning_rate": 2.496489405972435e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210664161.0,
|
|
"reward": 0.6640625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.6640625,
|
|
"rewards/drgrpo_math_reward/std": 0.47417303919792175,
|
|
"step": 2594
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.455913978494624,
|
|
"grad_norm": 0.059419224218842485,
|
|
"learning_rate": 2.471565711477952e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210726960.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.458781362007168,
|
|
"grad_norm": 0.0727835119606027,
|
|
"learning_rate": 2.4467654979794638e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 210798391.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2596
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.461648745519713,
|
|
"grad_norm": 0.08160964499162693,
|
|
"learning_rate": 2.4220887968765868e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 210873675.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2597
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 3.449634187350785e-09,
|
|
"advantages/std": 0.4049657881259918,
|
|
"advantages/var": 0.1639972895525057,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.464516129032258,
|
|
"grad_norm": 0.1949232815311928,
|
|
"learning_rate": 2.397535639412551e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 210960638.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.094686359167099,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2598
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 9.958683201273463e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.467383512544803,
|
|
"grad_norm": 0.11237903132808232,
|
|
"learning_rate": 2.3731060566741455e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211029634.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2599
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.4702508960573475,
|
|
"grad_norm": 0.12123751878436803,
|
|
"learning_rate": 2.3488000795917505e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211107753.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.473118279569892,
|
|
"grad_norm": 0.02415472199030343,
|
|
"learning_rate": 2.3246177389392384e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211183526.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2601
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.4759856630824375,
|
|
"grad_norm": 0.09135367348034021,
|
|
"learning_rate": 2.3005590653338958e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211249316.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2602
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.478853046594982,
|
|
"grad_norm": 0.06598533088634849,
|
|
"learning_rate": 2.2766240892365006e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211330585.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 2603
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.481720430107527,
|
|
"grad_norm": 0.09261287130862039,
|
|
"learning_rate": 2.2528128409511792e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 211408453.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2604
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 7.484587813620072,
|
|
"grad_norm": 0.06230076952933386,
|
|
"learning_rate": 2.2291253506253936e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211484834.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975144418133046e-09,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.487455197132617,
|
|
"grad_norm": 0.07346304914257873,
|
|
"learning_rate": 2.205561648249943e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211566520.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2606
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966756148857264e-09,
|
|
"advantages/std": 0.467604398727417,
|
|
"advantages/var": 0.21865387370922917,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.490322580645161,
|
|
"grad_norm": 0.11220421222180983,
|
|
"learning_rate": 2.1821217636589174e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211644505.0,
|
|
"reward": 0.671875,
|
|
"reward_std": 0.11678344011306763,
|
|
"rewards/drgrpo_math_reward/mean": 0.671875,
|
|
"rewards/drgrpo_math_reward/std": 0.4713755249977112,
|
|
"step": 2607
|
|
},
|
|
{
|
|
"advantages/mean": 6.05359673500061e-09,
|
|
"advantages/snr": 1.2945884681216828e-08,
|
|
"advantages/std": 0.4676077961921692,
|
|
"advantages/var": 0.21865705105969724,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.493189964157706,
|
|
"grad_norm": 0.1079590307864344,
|
|
"learning_rate": 2.1588057265295778e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211732485.0,
|
|
"reward": 0.703125,
|
|
"reward_std": 0.12125921249389648,
|
|
"rewards/drgrpo_math_reward/mean": 0.703125,
|
|
"rewards/drgrpo_math_reward/std": 0.45867621898651123,
|
|
"step": 2608
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149914704966296e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.496057347670251,
|
|
"grad_norm": 0.07966709391206533,
|
|
"learning_rate": 2.1356135663824326e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211813730.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2609
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983473280509385e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.498924731182796,
|
|
"grad_norm": 0.10580857336392352,
|
|
"learning_rate": 2.1125453125811376e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211887361.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.13132843852049e-10,
|
|
"advantages/std": 0.5726755261421204,
|
|
"advantages/var": 0.3279572582421544,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.828125,
|
|
"epoch": 7.50179211469534,
|
|
"grad_norm": 0.13645734666840212,
|
|
"learning_rate": 2.0896009943324632e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 211984432.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.15072788298130035,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2611
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.9876049603820392e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.504659498207886,
|
|
"grad_norm": 0.08790942342493094,
|
|
"learning_rate": 2.066780640686272e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 212073616.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2612
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.50752688172043,
|
|
"grad_norm": 0.09131229022780211,
|
|
"learning_rate": 2.044084280535452e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 212152939.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2613
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.510394265232975,
|
|
"grad_norm": 0.08429131273522639,
|
|
"learning_rate": 2.02151194261595e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 212219148.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2614
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.5132616487455195,
|
|
"grad_norm": 0.10716352857812375,
|
|
"learning_rate": 1.9990636555066497e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 212301104.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629027821475993e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.516129032258064,
|
|
"grad_norm": 0.08845085702452267,
|
|
"learning_rate": 1.976739447629383e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 212395020.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2616
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 3.983533706996105e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.5189964157706095,
|
|
"grad_norm": 0.07328193602838753,
|
|
"learning_rate": 1.9545393472488736e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 212471284.0,
|
|
"reward": 0.6328125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.6328125,
|
|
"rewards/drgrpo_math_reward/std": 0.4839322865009308,
|
|
"step": 2617
|
|
},
|
|
{
|
|
"advantages/mean": 3.725290298461914e-09,
|
|
"advantages/snr": 6.504827144745217e-09,
|
|
"advantages/std": 0.5726962685585022,
|
|
"advantages/var": 0.3279810160208321,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.521863799283154,
|
|
"grad_norm": 0.13204499760923247,
|
|
"learning_rate": 1.9324633824727266e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 212552830.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.17358636856079102,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2618
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.524731182795699,
|
|
"grad_norm": 0.018722105156220862,
|
|
"learning_rate": 1.910511581251406e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 212645788.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2619
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 9.958237646101366e-10,
|
|
"advantages/std": 0.46761414408683777,
|
|
"advantages/var": 0.21866298775006587,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.527598566308244,
|
|
"grad_norm": 0.08477950311774352,
|
|
"learning_rate": 1.8886839713781133e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 212728777.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.12597234547138214,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.563018557708836e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.530465949820789,
|
|
"grad_norm": 0.15156927480755558,
|
|
"learning_rate": 1.8669805804888418e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 212811023.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.1173202320933342,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2621
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899641578136434e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.533333333333333,
|
|
"grad_norm": 0.15132867500073152,
|
|
"learning_rate": 1.8454014360623217e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 212878184.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2622
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 4.878747807970186e-09,
|
|
"advantages/std": 0.5726813077926636,
|
|
"advantages/var": 0.3279638802951155,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.536200716845878,
|
|
"grad_norm": 0.1360717886896722,
|
|
"learning_rate": 1.8239465654199648e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 212957693.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.15650184452533722,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2623
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.539068100358423,
|
|
"grad_norm": 0.031033377471276283,
|
|
"learning_rate": 1.8026159957258092e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 213032690.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2624
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.541935483870968,
|
|
"grad_norm": 0.10799227920738447,
|
|
"learning_rate": 1.7814097539865625e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 213107605.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.544802867383512,
|
|
"grad_norm": 0.07755664155455574,
|
|
"learning_rate": 1.7603278670515144e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 213178410.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 2626
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.547670250896058,
|
|
"grad_norm": 0.0730335684037607,
|
|
"learning_rate": 1.73937036161248e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 213258863.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2627
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.550537634408602,
|
|
"grad_norm": 0.059015984902569284,
|
|
"learning_rate": 1.718537264203801e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 213330496.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2628
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629027821475993e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.553405017921147,
|
|
"grad_norm": 0.11231311667042682,
|
|
"learning_rate": 1.6978286012023222e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 213421396.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2629
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 6.9709613904608755e-09,
|
|
"advantages/std": 0.46760106086730957,
|
|
"advantages/var": 0.21865075212423335,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.5562724014336915,
|
|
"grad_norm": 0.11214757300032535,
|
|
"learning_rate": 1.6772443988273377e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 213507711.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.7495735248314805e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.559139784946236,
|
|
"grad_norm": 0.09602851981784188,
|
|
"learning_rate": 1.6567846831405663e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 213590115.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 2631
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.5620071684587815,
|
|
"grad_norm": 0.12431809933411388,
|
|
"learning_rate": 1.636449480046076e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 213671775.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2632
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.564874551971326,
|
|
"grad_norm": 0.06941940624352973,
|
|
"learning_rate": 1.6162388152903493e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 213750337.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2633
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.567741935483871,
|
|
"grad_norm": 0.08651216510041282,
|
|
"learning_rate": 1.5961527144621402e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 213830135.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2634
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1498891480459116e-08,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.570609318996416,
|
|
"grad_norm": 0.07142790297650317,
|
|
"learning_rate": 1.5761912029925384e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 213900475.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.573476702508961,
|
|
"grad_norm": 0.04969763682950268,
|
|
"learning_rate": 1.5563543061548166e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 213977629.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2636
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907216333870301e-10,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.576344086021505,
|
|
"grad_norm": 0.14655240702803526,
|
|
"learning_rate": 1.536642049064574e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 214067352.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2637
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998470125758874e-09,
|
|
"advantages/std": 0.40494978427886963,
|
|
"advantages/var": 0.16398432778750305,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.57921146953405,
|
|
"grad_norm": 0.08783129596597063,
|
|
"learning_rate": 1.5170544566795006e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214149402.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2638
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.449667444137735e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.582078853046595,
|
|
"grad_norm": 0.10881172440004862,
|
|
"learning_rate": 1.4975915537995267e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214228826.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2639
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.58494623655914,
|
|
"grad_norm": 0.044457005489313625,
|
|
"learning_rate": 1.478253365066673e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214300863.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.587813620071684,
|
|
"grad_norm": 0.06460679066180552,
|
|
"learning_rate": 1.4590399149650767e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214377799.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2641
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 4.516862785289507e-09,
|
|
"advantages/std": 0.6185637712478638,
|
|
"advantages/var": 0.38262113910037954,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.59068100358423,
|
|
"grad_norm": 0.1520851435633976,
|
|
"learning_rate": 1.4399512278209124e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 214463776.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.17965975403785706,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2642
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899540529955257e-09,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.593548387096774,
|
|
"grad_norm": 0.10438232518063313,
|
|
"learning_rate": 1.4209873278024475e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214554856.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2643
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.596415770609319,
|
|
"grad_norm": 0.12431999646667162,
|
|
"learning_rate": 1.402148238919898e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214642966.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2644
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 5.749358156051495e-09,
|
|
"advantages/std": 0.4049680531024933,
|
|
"advantages/var": 0.16399912403362382,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.5992831541218635,
|
|
"grad_norm": 0.08325927045210327,
|
|
"learning_rate": 1.383433985025495e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 214717771.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09704046696424484,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.602150537634409,
|
|
"grad_norm": 0.11131178063694702,
|
|
"learning_rate": 1.3648445898133964e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214804945.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2646
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.1950220288145723e-08,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.6050179211469535,
|
|
"grad_norm": 0.1255617216792842,
|
|
"learning_rate": 1.3463800768196864e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214882079.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2647
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344527836563254e-09,
|
|
"advantages/std": 0.5227716565132141,
|
|
"advantages/var": 0.2732902048535699,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.607885304659498,
|
|
"grad_norm": 0.1825647268533084,
|
|
"learning_rate": 1.3280404694223313e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 214961601.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.1173202246427536,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2648
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.610752688172043,
|
|
"grad_norm": 0.10938545343611937,
|
|
"learning_rate": 1.309825790841146e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215050605.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2649
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.613620071684588,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.2917360641377827e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215118426.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958849501312727e-10,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.616487455197133,
|
|
"grad_norm": 0.11817853305625323,
|
|
"learning_rate": 1.273771312215699e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215196015.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2651
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.619354838709677,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.2559315578201223e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215271289.0,
|
|
"reward": 1.0,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 1.0,
|
|
"rewards/drgrpo_math_reward/std": 0.0,
|
|
"step": 2652
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599796258942519e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.622222222222222,
|
|
"grad_norm": 0.1426115443118561,
|
|
"learning_rate": 1.2382168235379743e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 215341639.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2653
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.625089605734767,
|
|
"grad_norm": 7.876231650309869,
|
|
"learning_rate": 1.2206271317979577e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215420069.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2654
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.627956989247312,
|
|
"grad_norm": 0.07476541749539088,
|
|
"learning_rate": 1.203162504870414e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215496886.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.299880526045478e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.630824372759856,
|
|
"grad_norm": 0.13475767375141265,
|
|
"learning_rate": 1.1858229648673446e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215578056.0,
|
|
"reward": 0.8828125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8828125,
|
|
"rewards/drgrpo_math_reward/std": 0.322907418012619,
|
|
"step": 2656
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125782003796406e-09,
|
|
"advantages/std": 0.5227903723716736,
|
|
"advantages/var": 0.2733097734445131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.633691756272402,
|
|
"grad_norm": 0.1014995301777789,
|
|
"learning_rate": 1.168608533742399e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215665608.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.13781970739364624,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2657
|
|
},
|
|
{
|
|
"advantages/mean": -6.51925802230835e-09,
|
|
"advantages/snr": 1.1383733037079748e-08,
|
|
"advantages/std": 0.5726819038391113,
|
|
"advantages/var": 0.32796456298478915,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.636559139784946,
|
|
"grad_norm": 0.12712537844489796,
|
|
"learning_rate": 1.1515192332907875e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 215742089.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.15756267309188843,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2658
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.639426523297491,
|
|
"grad_norm": 0.11537309470133952,
|
|
"learning_rate": 1.1345550851493469e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215819150.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2659
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983562397524497e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.6422939068100355,
|
|
"grad_norm": 0.09980575467717229,
|
|
"learning_rate": 1.1177161107964184e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215898070.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.645161290322581,
|
|
"grad_norm": 0.08570709282552061,
|
|
"learning_rate": 1.1010023315518591e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 215979536.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2661
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 7.6480286738351255,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.0844137685770194e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 216047337.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2662
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344503462080032e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.65089605734767,
|
|
"grad_norm": 0.13974908876271358,
|
|
"learning_rate": 1.0679504428747543e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 216133284.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.12073517590761185,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2663
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.6537634408602155,
|
|
"grad_norm": 0.06524778747399308,
|
|
"learning_rate": 1.0516123752893013e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 216209635.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2664
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 5.749701315113695e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.65663082437276,
|
|
"grad_norm": 0.1392669636215334,
|
|
"learning_rate": 1.0353995865063137e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 216289854.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.659498207885305,
|
|
"grad_norm": 0.07184046073763721,
|
|
"learning_rate": 1.0193120970528602e-08,
|
|
"loss": -0.0,
|
|
"num_tokens": 216380214.0,
|
|
"reward": 0.734375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.734375,
|
|
"rewards/drgrpo_math_reward/std": 0.44340085983276367,
|
|
"step": 2666
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.662365591397849,
|
|
"grad_norm": 0.08833598069209586,
|
|
"learning_rate": 1.00334992729737e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 216461696.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2667
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 1.3798774372216438e-08,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.665232974910394,
|
|
"grad_norm": 0.08168051975988941,
|
|
"learning_rate": 9.87513097449555e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 216543066.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2668
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.9752099207640785e-09,
|
|
"advantages/std": 0.4675932228565216,
|
|
"advantages/var": 0.21864342206134868,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 7.668100358422939,
|
|
"grad_norm": 0.10562166931093338,
|
|
"learning_rate": 9.718016275604756e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 216632354.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.1054728776216507,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2669
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.670967741935484,
|
|
"grad_norm": 0.17415648960053537,
|
|
"learning_rate": 9.562155375224756e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 216714101.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.673835125448028,
|
|
"grad_norm": 0.13275102999138666,
|
|
"learning_rate": 9.407548470691251e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 216789143.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2671
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.676702508960574,
|
|
"grad_norm": 0.09636463428576127,
|
|
"learning_rate": 9.254195757752547e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 216869537.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2672
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.679569892473118,
|
|
"grad_norm": 0.0389602525214859,
|
|
"learning_rate": 9.102097430568889e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 216937156.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2673
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.682437275985663,
|
|
"grad_norm": 0.10580992641584908,
|
|
"learning_rate": 8.951253681712234e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217018393.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2674
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 6.022598620254011e-09,
|
|
"advantages/std": 0.6185519695281982,
|
|
"advantages/var": 0.3826065390072131,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.6853046594982075,
|
|
"grad_norm": 0.13720080016998606,
|
|
"learning_rate": 8.801664702166367e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217102988.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.1659901738166809,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.688172043010753,
|
|
"grad_norm": 0.07001202663451297,
|
|
"learning_rate": 8.653330681326232e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217180704.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2676
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 2.6720844723812904e-09,
|
|
"advantages/std": 0.5228067636489868,
|
|
"advantages/var": 0.27332691211712756,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.6910394265232975,
|
|
"grad_norm": 0.17707506611321436,
|
|
"learning_rate": 8.506251806997932e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217257453.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.15490421652793884,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2677
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199591840825068e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.693906810035842,
|
|
"grad_norm": 0.09510770528265325,
|
|
"learning_rate": 8.3604282653984e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217334152.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2678
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814751550759118e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.6967741935483875,
|
|
"grad_norm": 0.1117515767481919,
|
|
"learning_rate": 8.215860241155058e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217427153.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2679
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 8.907256955369e-09,
|
|
"advantages/std": 0.5227886438369751,
|
|
"advantages/var": 0.2733079661249036,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 7.699641577060932,
|
|
"grad_norm": 0.09211896656131062,
|
|
"learning_rate": 8.072547917305939e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 217508592.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.1354655921459198,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 7.702508960573477,
|
|
"grad_norm": 0.07907253225985557,
|
|
"learning_rate": 7.930491475299227e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217598379.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2681
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.8996420859237135e-09,
|
|
"advantages/std": 0.4049438536167145,
|
|
"advantages/var": 0.16397952458195508,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.705376344086021,
|
|
"grad_norm": 0.09680865606183092,
|
|
"learning_rate": 7.789691094992834e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217665828.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2682
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.708243727598567,
|
|
"grad_norm": 0.07900924007013105,
|
|
"learning_rate": 7.650146954654491e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217735925.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2683
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.4082993428404723e-08,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.711111111111111,
|
|
"grad_norm": 0.08300974944848195,
|
|
"learning_rate": 7.511859230961315e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217814333.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2684
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.0954489382432772e-08,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.713978494623656,
|
|
"grad_norm": 0.08399229608988694,
|
|
"learning_rate": 7.37482809900003e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217885789.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.7168458781362,
|
|
"grad_norm": 0.06226440592484084,
|
|
"learning_rate": 7.239053732265743e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 217958104.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2686
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633180108710322e-09,
|
|
"advantages/std": 0.3306560516357422,
|
|
"advantages/var": 0.1093334244833386,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.719713261648746,
|
|
"grad_norm": 0.1327203335786122,
|
|
"learning_rate": 7.104536302662833e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218031770.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.0657544732093811,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2687
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 8.049325332145131e-09,
|
|
"advantages/std": 0.40495678782463074,
|
|
"advantages/var": 0.163990000005243,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.72258064516129,
|
|
"grad_norm": 0.08045891667395001,
|
|
"learning_rate": 6.971275980504176e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218115744.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.08785156160593033,
|
|
"rewards/drgrpo_math_reward/mean": 0.75,
|
|
"rewards/drgrpo_math_reward/std": 0.434714138507843,
|
|
"step": 2688
|
|
},
|
|
{
|
|
"advantages/mean": 4.190951585769653e-09,
|
|
"advantages/snr": 8.016351208262037e-09,
|
|
"advantages/std": 0.5228003859519958,
|
|
"advantages/var": 0.2733202435515558,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.734375,
|
|
"epoch": 7.725448028673835,
|
|
"grad_norm": 0.20407715647000374,
|
|
"learning_rate": 6.8392729345111425e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218200423.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.14913025498390198,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2689
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.5629072505384383e-09,
|
|
"advantages/std": 0.5227879881858826,
|
|
"advantages/var": 0.2733072805914425,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.7283154121863795,
|
|
"grad_norm": 0.11691903434726374,
|
|
"learning_rate": 6.7085273318128185e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218278595.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.1344047486782074,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.731182795698925,
|
|
"grad_norm": 0.059861063471256346,
|
|
"learning_rate": 6.5790393379467905e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218361352.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2691
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.9835626514248234e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.7340501792114695,
|
|
"grad_norm": 0.09439922768306205,
|
|
"learning_rate": 6.450809116858136e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218444512.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2692
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125678014490734e-09,
|
|
"advantages/std": 0.5227980017662048,
|
|
"advantages/var": 0.2733177506507367,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.736917562724014,
|
|
"grad_norm": 0.12924627370163774,
|
|
"learning_rate": 6.32383683089932e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 218528346.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.14571532607078552,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2693
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814379860189666e-09,
|
|
"advantages/std": 0.5227925777435303,
|
|
"advantages/var": 0.27331207934372515,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.7397849462365595,
|
|
"grad_norm": 0.16310779641981915,
|
|
"learning_rate": 6.1981226408303056e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218613416.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.13782459497451782,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2694
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.742652329749104,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 6.073666705818104e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218688676.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 6.971027590413914e-09,
|
|
"advantages/std": 0.4675966203212738,
|
|
"advantages/var": 0.2186465993358775,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.745519713261649,
|
|
"grad_norm": 0.09387237257468709,
|
|
"learning_rate": 5.9504691834368905e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218784615.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.10994865000247955,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2696
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.4082993428404723e-08,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.748387096774193,
|
|
"grad_norm": 0.07097908960534689,
|
|
"learning_rate": 5.828530229667228e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218849472.0,
|
|
"reward": 0.953125,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.953125,
|
|
"rewards/drgrpo_math_reward/std": 0.21220162510871887,
|
|
"step": 2697
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499234216592094e-08,
|
|
"advantages/std": 0.404949814081192,
|
|
"advantages/var": 0.16398435192439198,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.751254480286739,
|
|
"grad_norm": 0.1148408687090542,
|
|
"learning_rate": 5.7078499988961745e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 218925077.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2698
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.5227786302566528,
|
|
"advantages/var": 0.27329749625302213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.754121863799283,
|
|
"grad_norm": 0.14558999467538067,
|
|
"learning_rate": 5.588428643917509e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219011215.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.12415502220392227,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2699
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.756989247311828,
|
|
"grad_norm": 0.05451999267912209,
|
|
"learning_rate": 5.4702663159308385e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219088852.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167162292944866e-09,
|
|
"advantages/std": 0.3306412398815155,
|
|
"advantages/var": 0.10932362951038588,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.759856630824372,
|
|
"grad_norm": 0.0705969087857691,
|
|
"learning_rate": 5.353363164541824e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 219173993.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.05550473928451538,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2701
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.762724014336918,
|
|
"grad_norm": 0.11894700660589524,
|
|
"learning_rate": 5.2377193377617365e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219249412.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2702
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.765591397849462,
|
|
"grad_norm": 0.11047274624979665,
|
|
"learning_rate": 5.123334982007566e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219323901.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2703
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 3.4497441148988883e-09,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.768458781362007,
|
|
"grad_norm": 0.11190909380506751,
|
|
"learning_rate": 5.0102102421016865e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219407707.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2704
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.7713261648745515,
|
|
"grad_norm": 0.040562304752599074,
|
|
"learning_rate": 4.8983452612715306e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 219471542.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.125773067096241e-09,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.774193548387097,
|
|
"grad_norm": 0.14838139256253263,
|
|
"learning_rate": 4.78774018114958e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219562303.0,
|
|
"reward": 0.7265625,
|
|
"reward_std": 0.13888053596019745,
|
|
"rewards/drgrpo_math_reward/mean": 0.7265625,
|
|
"rewards/drgrpo_math_reward/std": 0.447474867105484,
|
|
"step": 2706
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 1.9917811987622486e-09,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.7770609318996415,
|
|
"grad_norm": 0.13831066794407268,
|
|
"learning_rate": 4.678395141773373e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219653895.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.09522313624620438,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 2707
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.975244195968941e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.779928315412186,
|
|
"grad_norm": 0.16448989630108082,
|
|
"learning_rate": 4.570310281584832e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219723862.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2708
|
|
},
|
|
{
|
|
"advantages/mean": 1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.7827956989247316,
|
|
"grad_norm": 0.0648672440595542,
|
|
"learning_rate": 4.463485737430605e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219806311.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2709
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299897960206267e-09,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.785663082437276,
|
|
"grad_norm": 0.10196929770514401,
|
|
"learning_rate": 4.35792164456128e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 219883002.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.788530465949821,
|
|
"grad_norm": 0.06411780572392889,
|
|
"learning_rate": 4.253618136631943e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 219954027.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2711
|
|
},
|
|
{
|
|
"advantages/mean": 9.313225746154785e-10,
|
|
"advantages/snr": 2.8167571052905777e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.791397849462365,
|
|
"grad_norm": 0.0983928491598763,
|
|
"learning_rate": 4.1505753457016235e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220026131.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2712
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 7.52825480422812e-09,
|
|
"advantages/std": 0.6185514330863953,
|
|
"advantages/var": 0.3826058753732333,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.794265232974911,
|
|
"grad_norm": 0.1578133285862227,
|
|
"learning_rate": 4.048793402232853e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220111872.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.1649293452501297,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2713
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.40494081377983093,
|
|
"advantages/var": 0.1639770626646717,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.797132616487455,
|
|
"grad_norm": 0.06988750574363187,
|
|
"learning_rate": 3.948272435092214e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220184531.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.07312604784965515,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2714
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.8,
|
|
"grad_norm": 0.030883191530758667,
|
|
"learning_rate": 3.849012571549348e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220263468.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.802867383512545,
|
|
"grad_norm": 0.08513554698850792,
|
|
"learning_rate": 3.751013937277614e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220333592.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2716
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.80573476702509,
|
|
"grad_norm": 0.12487730211503031,
|
|
"learning_rate": 3.654276656353206e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220411113.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2717
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.808602150537634,
|
|
"grad_norm": 0.06614608905441266,
|
|
"learning_rate": 3.5588008512555944e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220484064.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2718
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.811469534050179,
|
|
"grad_norm": 0.08775363283175698,
|
|
"learning_rate": 3.4645866428667514e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220558740.0,
|
|
"reward": 0.8046875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.8046875,
|
|
"rewards/drgrpo_math_reward/std": 0.3979988098144531,
|
|
"step": 2719
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.8143369175627235,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 3.371634150471481e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220636615.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 1.149940263022739e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.817204301075269,
|
|
"grad_norm": 0.16322981571413456,
|
|
"learning_rate": 3.27994349175742e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 220702588.0,
|
|
"reward": 0.9453125,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.9453125,
|
|
"rewards/drgrpo_math_reward/std": 0.22826264798641205,
|
|
"step": 2721
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 9.958626076587386e-10,
|
|
"advantages/std": 0.4675959050655365,
|
|
"advantages/var": 0.21864593043405822,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.8200716845878135,
|
|
"grad_norm": 0.12377264896800805,
|
|
"learning_rate": 3.189514782814151e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220775512.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2722
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.252482966806137e-09,
|
|
"advantages/std": 0.5726840496063232,
|
|
"advantages/var": 0.3279670206734977,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.822939068100358,
|
|
"grad_norm": 0.12787739498772854,
|
|
"learning_rate": 3.1003481381337572e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220861917.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.16097760200500488,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2723
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.198954094593892e-09,
|
|
"advantages/std": 0.40496888756752014,
|
|
"advantages/var": 0.16399979989767477,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.825806451612904,
|
|
"grad_norm": 0.08716448625132536,
|
|
"learning_rate": 3.0124436706102653e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 220944635.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.09810129553079605,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2724
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.828673835125448,
|
|
"grad_norm": 0.167880645885384,
|
|
"learning_rate": 2.925801491539981e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221009199.0,
|
|
"reward": 0.984375,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.984375,
|
|
"rewards/drgrpo_math_reward/std": 0.12450689822435379,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675827622413635,
|
|
"advantages/var": 0.2186336395452635,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.831541218637993,
|
|
"grad_norm": 0.1561044105138881,
|
|
"learning_rate": 2.840421710620489e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221083621.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2726
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 6.899847745916856e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.834408602150537,
|
|
"grad_norm": 0.10871773923395783,
|
|
"learning_rate": 2.7563044359514286e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221164548.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2727
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499235062879438e-08,
|
|
"advantages/std": 0.40494978427886963,
|
|
"advantages/var": 0.16398432778750305,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.837275985663083,
|
|
"grad_norm": 0.10976454459404987,
|
|
"learning_rate": 2.6734497740340533e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221240488.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.07996084541082382,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2728
|
|
},
|
|
{
|
|
"advantages/mean": -4.6566128730773926e-09,
|
|
"advantages/snr": 1.1499147049662961e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.840143369175627,
|
|
"grad_norm": 0.08882144801910914,
|
|
"learning_rate": 2.591857829770672e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221325464.0,
|
|
"reward": 0.765625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.765625,
|
|
"rewards/drgrpo_math_reward/std": 0.42527204751968384,
|
|
"step": 2729
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 4.2250872920904075e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.843010752688172,
|
|
"grad_norm": 0.05628016315565577,
|
|
"learning_rate": 2.5115287064650934e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221400026.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.845878136200717,
|
|
"grad_norm": 0.11938439253889663,
|
|
"learning_rate": 2.432462505822297e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221481678.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2731
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.848745519713262,
|
|
"grad_norm": 0.04007472771262617,
|
|
"learning_rate": 2.3546593279482053e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221566319.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 2732
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344599133202926e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.851612903225806,
|
|
"grad_norm": 0.10316750023043703,
|
|
"learning_rate": 2.2781192713494656e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221658000.0,
|
|
"reward": 0.7109375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.7109375,
|
|
"rewards/drgrpo_math_reward/std": 0.45510825514793396,
|
|
"step": 2733
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966992261291921e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.854480286738351,
|
|
"grad_norm": 0.09879265918172515,
|
|
"learning_rate": 2.2028424329337827e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 221737115.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2734
|
|
},
|
|
{
|
|
"advantages/mean": -1.3969838619232178e-09,
|
|
"advantages/snr": 2.9875550720364307e-09,
|
|
"advantages/std": 0.4676010310649872,
|
|
"advantages/var": 0.2186507242530391,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.857347670250896,
|
|
"grad_norm": 0.23714559186754167,
|
|
"learning_rate": 2.1288289080092504e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221822242.0,
|
|
"reward": 0.796875,
|
|
"reward_std": 0.11230766773223877,
|
|
"rewards/drgrpo_math_reward/mean": 0.796875,
|
|
"rewards/drgrpo_math_reward/std": 0.40390563011169434,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.5995914574054795e-09,
|
|
"advantages/std": 0.4049588143825531,
|
|
"advantages/var": 0.1639916413461231,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 7.860215053763441,
|
|
"grad_norm": 0.055693316196279354,
|
|
"learning_rate": 2.056078790284688e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221911649.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0867956355214119,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2736
|
|
},
|
|
{
|
|
"advantages/mean": -6.984919309616089e-09,
|
|
"advantages/snr": 1.219668799133311e-08,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.8630824372759855,
|
|
"grad_norm": 0.15205751802922915,
|
|
"learning_rate": 1.9845921718690816e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 221992707.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.16675157845020294,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2737
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4675983488559723,
|
|
"advantages/var": 0.21864821585283156,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.86594982078853,
|
|
"grad_norm": 0.10385354678090679,
|
|
"learning_rate": 1.914369143272032e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222077741.0,
|
|
"reward": 0.7578125,
|
|
"reward_std": 0.10889272391796112,
|
|
"rewards/drgrpo_math_reward/mean": 0.7578125,
|
|
"rewards/drgrpo_math_reward/std": 0.4300905168056488,
|
|
"step": 2738
|
|
},
|
|
{
|
|
"advantages/mean": -5.587935447692871e-09,
|
|
"advantages/snr": 9.757858184220564e-09,
|
|
"advantages/std": 0.5726600289344788,
|
|
"advantages/var": 0.32793950873923805,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.868817204301076,
|
|
"grad_norm": 0.184846041114866,
|
|
"learning_rate": 1.8454097934027524e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222149218.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.13258251547813416,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2739
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.330655038356781,
|
|
"advantages/var": 0.10933275439072432,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.87168458781362,
|
|
"grad_norm": 0.07811272576960604,
|
|
"learning_rate": 1.7777142095711794e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 222238855.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.0646936446428299,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.983539800525091e-09,
|
|
"advantages/std": 0.46758541464805603,
|
|
"advantages/var": 0.2186361199915945,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.874551971326165,
|
|
"grad_norm": 0.11646064019977592,
|
|
"learning_rate": 1.7112824774866419e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222318739.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.09863808751106262,
|
|
"rewards/drgrpo_math_reward/mean": 0.65625,
|
|
"rewards/drgrpo_math_reward/std": 0.47682511806488037,
|
|
"step": 2741
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.0,
|
|
"advantages/var": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.877419354838709,
|
|
"grad_norm": 0.0,
|
|
"learning_rate": 1.6461146812586368e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222387888.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.0,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2742
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.880286738351255,
|
|
"grad_norm": 0.07704070029133948,
|
|
"learning_rate": 1.582210903396275e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 222458262.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2743
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199113184367294e-09,
|
|
"advantages/std": 0.40496188402175903,
|
|
"advantages/var": 0.16399412751045261,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.78125,
|
|
"epoch": 7.883154121863799,
|
|
"grad_norm": 0.20750762751742974,
|
|
"learning_rate": 1.5195712248081693e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222538911.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.09021057933568954,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2744
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.886021505376344,
|
|
"grad_norm": 0.028746765543212663,
|
|
"learning_rate": 1.4581957248026577e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222614453.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 7.966726021133282e-09,
|
|
"advantages/std": 0.23380307853221893,
|
|
"advantages/var": 0.054663879531142934,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.888888888888889,
|
|
"grad_norm": 0.08040992155965311,
|
|
"learning_rate": 1.3980844810875803e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222687585.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.0289318785071373,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2746
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 4.453752885066694e-09,
|
|
"advantages/std": 0.5227740406990051,
|
|
"advantages/var": 0.27329269762876507,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.891756272401434,
|
|
"grad_norm": 0.10904768791813681,
|
|
"learning_rate": 1.3392375697696134e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222768456.0,
|
|
"reward": 0.7734375,
|
|
"reward_std": 0.12073516845703125,
|
|
"rewards/drgrpo_math_reward/mean": 0.7734375,
|
|
"rewards/drgrpo_math_reward/std": 0.4202519655227661,
|
|
"step": 2747
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.894623655913978,
|
|
"grad_norm": 0.056924319592008936,
|
|
"learning_rate": 1.2816550653551584e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 222847091.0,
|
|
"reward": 0.828125,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.828125,
|
|
"rewards/drgrpo_math_reward/std": 0.3787541687488556,
|
|
"step": 2748
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.7814751550759118e-09,
|
|
"advantages/std": 0.5227816700935364,
|
|
"advantages/var": 0.2733006745857871,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.897491039426523,
|
|
"grad_norm": 0.19184599630160612,
|
|
"learning_rate": 1.2253370407495634e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 222921748.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.12863078713417053,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2749
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 7.966992261291921e-09,
|
|
"advantages/std": 0.4675905406475067,
|
|
"advantages/var": 0.21864091370302763,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.900358422939068,
|
|
"grad_norm": 0.08997440020834355,
|
|
"learning_rate": 1.1702835672572353e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 223008554.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.10205793380737305,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 7.041577316723057e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.903225806451613,
|
|
"grad_norm": 0.11961462196368258,
|
|
"learning_rate": 1.1164947145815285e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 223083316.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2751
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.9060931899641576,
|
|
"grad_norm": 0.07615611470258046,
|
|
"learning_rate": 1.0639705508245222e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 223162187.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2752
|
|
},
|
|
{
|
|
"advantages/mean": 4.656612873077393e-10,
|
|
"advantages/snr": 8.907216333870301e-10,
|
|
"advantages/std": 0.5227910280227661,
|
|
"advantages/var": 0.2733104589811006,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.890625,
|
|
"epoch": 7.908960573476703,
|
|
"grad_norm": 0.11669161858992876,
|
|
"learning_rate": 1.0127111424872436e-09,
|
|
"loss": -0.0,
|
|
"num_tokens": 223241428.0,
|
|
"reward": 0.8203125,
|
|
"reward_std": 0.13888052105903625,
|
|
"rewards/drgrpo_math_reward/mean": 0.8203125,
|
|
"rewards/drgrpo_math_reward/std": 0.3854354918003082,
|
|
"step": 2753
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.911827956989248,
|
|
"grad_norm": 0.05064682095632566,
|
|
"learning_rate": 9.62716554469445e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 223326818.0,
|
|
"reward": 0.9765625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9765625,
|
|
"rewards/drgrpo_math_reward/std": 0.15188287198543549,
|
|
"step": 2754
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.914695340501792,
|
|
"grad_norm": 0.05404498579841204,
|
|
"learning_rate": 9.139868500693815e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 223399407.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"advantages/mean": 2.7939677238464355e-09,
|
|
"advantages/snr": 5.975176026781512e-09,
|
|
"advantages/std": 0.4675958752632141,
|
|
"advantages/var": 0.2186459025631713,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.917562724014337,
|
|
"grad_norm": 0.08613914332482507,
|
|
"learning_rate": 8.665220909838123e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 223479721.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.10888782143592834,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2756
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.299949248638952e-09,
|
|
"advantages/std": 0.40493178367614746,
|
|
"advantages/var": 0.16396974943114628,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.920430107526881,
|
|
"grad_norm": 0.13985749059222663,
|
|
"learning_rate": 8.203223373078883e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 223551462.0,
|
|
"reward": 0.9296875,
|
|
"reward_std": 0.06629125773906708,
|
|
"rewards/drgrpo_math_reward/mean": 0.9296875,
|
|
"rewards/drgrpo_math_reward/std": 0.2566775679588318,
|
|
"step": 2757
|
|
},
|
|
{
|
|
"advantages/mean": 2.3283064365386963e-09,
|
|
"advantages/snr": 9.958180313570735e-09,
|
|
"advantages/std": 0.23380841314792633,
|
|
"advantages/var": 0.05466637405875141,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.9375,
|
|
"epoch": 7.923297491039427,
|
|
"grad_norm": 0.028255021993964898,
|
|
"learning_rate": 7.753876475353749e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 223625050.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.03234682232141495,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2758
|
|
},
|
|
{
|
|
"advantages/mean": -4.656612873077393e-10,
|
|
"advantages/snr": 1.149905988405531e-09,
|
|
"advantages/std": 0.4049559533596039,
|
|
"advantages/var": 0.16398932416138567,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.926164874551971,
|
|
"grad_norm": 0.09122449924121048,
|
|
"learning_rate": 7.317180785582078e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 223711809.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.08679073303937912,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2759
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 4.599761052090956e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.875,
|
|
"epoch": 7.929032258064516,
|
|
"grad_norm": 0.11140787709211054,
|
|
"learning_rate": 6.893136856664928e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 223790438.0,
|
|
"reward": 0.7890625,
|
|
"reward_std": 0.0765409916639328,
|
|
"rewards/drgrpo_math_reward/mean": 0.7890625,
|
|
"rewards/drgrpo_math_reward/std": 0.4095771610736847,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.931899641577061,
|
|
"grad_norm": 0.09015450308190896,
|
|
"learning_rate": 6.481745225485057e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 223860406.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2761
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.633702096822814e-09,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.934767025089606,
|
|
"grad_norm": 0.1530075651630246,
|
|
"learning_rate": 6.083006412906932e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 223932568.0,
|
|
"reward": 0.921875,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.921875,
|
|
"rewards/drgrpo_math_reward/std": 0.2694226801395416,
|
|
"step": 2762
|
|
},
|
|
{
|
|
"advantages/mean": -3.259629011154175e-09,
|
|
"advantages/snr": 9.858537014877616e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.93763440860215,
|
|
"grad_norm": 0.07419653303663103,
|
|
"learning_rate": 5.696920923774496e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 224001398.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.96875,
|
|
"rewards/drgrpo_math_reward/std": 0.1746762990951538,
|
|
"step": 2763
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199094228701277e-09,
|
|
"advantages/std": 0.4049627184867859,
|
|
"advantages/var": 0.1639948033642078,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.940501792114695,
|
|
"grad_norm": 0.07543987633431061,
|
|
"learning_rate": 5.323489246911172e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 224078782.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.09127141535282135,
|
|
"rewards/drgrpo_math_reward/mean": 0.875,
|
|
"rewards/drgrpo_math_reward/std": 0.3320184051990509,
|
|
"step": 2764
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.449892780067669e-09,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.94336917562724,
|
|
"grad_norm": 0.07923271569372832,
|
|
"learning_rate": 4.962711855120982e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 224158577.0,
|
|
"reward": 0.8671875,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8671875,
|
|
"rewards/drgrpo_math_reward/std": 0.3407054841518402,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"advantages/mean": 1.862645149230957e-09,
|
|
"advantages/snr": 5.633244590331673e-09,
|
|
"advantages/std": 0.33065226674079895,
|
|
"advantages/var": 0.10933092150082846,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.946236559139785,
|
|
"grad_norm": 0.07055910941482718,
|
|
"learning_rate": 4.614589205184094e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 224241620.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.06233953312039375,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2766
|
|
},
|
|
{
|
|
"advantages/mean": -5.122274160385132e-09,
|
|
"advantages/snr": 1.2649061754629257e-08,
|
|
"advantages/std": 0.40495288372039795,
|
|
"advantages/var": 0.16398683803346614,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.94910394265233,
|
|
"grad_norm": 0.07495124993441504,
|
|
"learning_rate": 4.279121737859048e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 224322394.0,
|
|
"reward": 0.890625,
|
|
"reward_std": 0.08337578922510147,
|
|
"rewards/drgrpo_math_reward/mean": 0.890625,
|
|
"rewards/drgrpo_math_reward/std": 0.31333550810813904,
|
|
"step": 2767
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 1.126665165824288e-08,
|
|
"advantages/std": 0.3306474983692169,
|
|
"advantages/var": 0.1093277681778213,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.951971326164875,
|
|
"grad_norm": 0.060232234185240006,
|
|
"learning_rate": 3.9563098778827576e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 224407186.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0578637570142746,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2768
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 3.2525469477123842e-09,
|
|
"advantages/std": 0.5726727843284607,
|
|
"advantages/var": 0.32795411791051166,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.96875,
|
|
"epoch": 7.95483870967742,
|
|
"grad_norm": 0.14730117650532376,
|
|
"learning_rate": 3.646154033968285e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 224481005.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.1462520956993103,
|
|
"rewards/drgrpo_math_reward/mean": 0.90625,
|
|
"rewards/drgrpo_math_reward/std": 0.29262590408325195,
|
|
"step": 2769
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199592517885038e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.957706093189964,
|
|
"grad_norm": 0.08582205000676567,
|
|
"learning_rate": 3.3486545988048454e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 224565442.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 1.2674839170101503e-08,
|
|
"advantages/std": 0.33065125346183777,
|
|
"advantages/var": 0.10933025141588448,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.953125,
|
|
"epoch": 7.960573476702509,
|
|
"grad_norm": 0.050238328452193,
|
|
"learning_rate": 3.063811949056694e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 224641292.0,
|
|
"reward": 0.8984375,
|
|
"reward_std": 0.061278700828552246,
|
|
"rewards/drgrpo_math_reward/mean": 0.8984375,
|
|
"rewards/drgrpo_math_reward/std": 0.3032590448856354,
|
|
"step": 2771
|
|
},
|
|
{
|
|
"advantages/mean": 3.259629011154175e-09,
|
|
"advantages/snr": 5.691787729288785e-09,
|
|
"advantages/std": 0.5726898312568665,
|
|
"advantages/var": 0.3279736428250182,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.963440860215054,
|
|
"grad_norm": 0.24453593361327636,
|
|
"learning_rate": 2.791626445364237e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 224711573.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.16675157845020294,
|
|
"rewards/drgrpo_math_reward/mean": 0.78125,
|
|
"rewards/drgrpo_math_reward/std": 0.41502299904823303,
|
|
"step": 2772
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.344599133202926e-09,
|
|
"advantages/std": 0.5227646827697754,
|
|
"advantages/var": 0.2732829135513839,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.966308243727599,
|
|
"grad_norm": 0.15305083215579046,
|
|
"learning_rate": 2.5320984323418113e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 224785946.0,
|
|
"reward": 0.9609375,
|
|
"reward_std": 0.11048543453216553,
|
|
"rewards/drgrpo_math_reward/mean": 0.9609375,
|
|
"rewards/drgrpo_math_reward/std": 0.194504976272583,
|
|
"step": 2773
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 5.975343977137235e-09,
|
|
"advantages/std": 0.46758273243904114,
|
|
"advantages/var": 0.21863361167515993,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.969175627240143,
|
|
"grad_norm": 0.12863254373966837,
|
|
"learning_rate": 2.2852282385787957e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 224864554.0,
|
|
"reward": 0.8515625,
|
|
"reward_std": 0.09522314369678497,
|
|
"rewards/drgrpo_math_reward/mean": 0.8515625,
|
|
"rewards/drgrpo_math_reward/std": 0.356930136680603,
|
|
"step": 2774
|
|
},
|
|
{
|
|
"advantages/mean": -2.3283064365386963e-09,
|
|
"advantages/snr": 3.764007346427472e-09,
|
|
"advantages/std": 0.6185711622238159,
|
|
"advantages/var": 0.3826302827349224,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"epoch": 7.972043010752688,
|
|
"grad_norm": 0.13836384217215011,
|
|
"learning_rate": 2.051016176637388e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 224957309.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.18884867429733276,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 1.9917668534980524e-09,
|
|
"advantages/std": 0.46758612990379333,
|
|
"advantages/var": 0.2186367888784071,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.84375,
|
|
"epoch": 7.974910394265233,
|
|
"grad_norm": 0.10442698926745332,
|
|
"learning_rate": 1.8294625430559373e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 225048398.0,
|
|
"reward": 0.6953125,
|
|
"reward_std": 0.09969891607761383,
|
|
"rewards/drgrpo_math_reward/mean": 0.6953125,
|
|
"rewards/drgrpo_math_reward/std": 0.46208351850509644,
|
|
"step": 2776
|
|
},
|
|
{
|
|
"advantages/mean": -3.725290298461914e-09,
|
|
"advantages/snr": 9.199522104181912e-09,
|
|
"advantages/std": 0.40494388341903687,
|
|
"advantages/var": 0.16397954871849052,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.984375,
|
|
"epoch": 7.977777777777778,
|
|
"grad_norm": 0.10296584659946971,
|
|
"learning_rate": 1.6205676183411732e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 225139410.0,
|
|
"reward": 0.8359375,
|
|
"reward_std": 0.0765409991145134,
|
|
"rewards/drgrpo_math_reward/mean": 0.8359375,
|
|
"rewards/drgrpo_math_reward/std": 0.371787428855896,
|
|
"step": 2777
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.33062541484832764,
|
|
"advantages/var": 0.10931316494362875,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.8125,
|
|
"epoch": 7.980645161290322,
|
|
"grad_norm": 0.05437560348032493,
|
|
"learning_rate": 1.4243316669781957e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 225224778.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.04419417306780815,
|
|
"rewards/drgrpo_math_reward/mean": 0.8125,
|
|
"rewards/drgrpo_math_reward/std": 0.39184603095054626,
|
|
"step": 2778
|
|
},
|
|
{
|
|
"advantages/mean": -4.190951585769653e-09,
|
|
"advantages/snr": 8.016207721177622e-09,
|
|
"advantages/std": 0.5228097438812256,
|
|
"advantages/var": 0.2733300282971527,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.796875,
|
|
"epoch": 7.983512544802867,
|
|
"grad_norm": 0.1326112289917123,
|
|
"learning_rate": 1.240754937420485e-10,
|
|
"loss": -0.0,
|
|
"num_tokens": 225312787.0,
|
|
"reward": 0.6796875,
|
|
"reward_std": 0.1593799889087677,
|
|
"rewards/drgrpo_math_reward/mean": 0.6796875,
|
|
"rewards/drgrpo_math_reward/std": 0.4684300124645233,
|
|
"step": 2779
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 2.2998981294712596e-09,
|
|
"advantages/std": 0.40494078397750854,
|
|
"advantages/var": 0.16397703852831924,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.921875,
|
|
"epoch": 7.986379928315412,
|
|
"grad_norm": 0.0786852035018988,
|
|
"learning_rate": 1.0698376620954518e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 225392859.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.07312605530023575,
|
|
"rewards/drgrpo_math_reward/mean": 0.9375,
|
|
"rewards/drgrpo_math_reward/std": 0.24301259219646454,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"advantages/mean": 0.0,
|
|
"advantages/snr": 0.0,
|
|
"advantages/std": 0.4049447178840637,
|
|
"advantages/var": 0.16398022454220396,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.859375,
|
|
"epoch": 7.989247311827957,
|
|
"grad_norm": 0.06871610885006592,
|
|
"learning_rate": 9.11580057402217e-11,
|
|
"loss": -0.0,
|
|
"num_tokens": 225476694.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.0776018276810646,
|
|
"rewards/drgrpo_math_reward/mean": 0.84375,
|
|
"rewards/drgrpo_math_reward/std": 0.3645188808441162,
|
|
"step": 2781
|
|
},
|
|
{
|
|
"advantages/mean": -1.862645149230957e-09,
|
|
"advantages/snr": 5.63344972278721e-09,
|
|
"advantages/std": 0.3306402266025543,
|
|
"advantages/var": 0.10932295944778847,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.90625,
|
|
"epoch": 7.992114695340502,
|
|
"grad_norm": 0.06211571730686199,
|
|
"learning_rate": 7.659823237105013e-11,
|
|
"loss": -0.0,
|
|
"num_tokens": 225557155.0,
|
|
"reward": 0.859375,
|
|
"reward_std": 0.05444391071796417,
|
|
"rewards/drgrpo_math_reward/mean": 0.859375,
|
|
"rewards/drgrpo_math_reward/std": 0.3490002751350403,
|
|
"step": 2782
|
|
},
|
|
{
|
|
"advantages/mean": -9.313225746154785e-10,
|
|
"advantages/snr": 3.983628835318941e-09,
|
|
"advantages/std": 0.23378747701644897,
|
|
"advantages/var": 0.05465658440971666,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.994982078853047,
|
|
"grad_norm": 0.04752566512581673,
|
|
"learning_rate": 6.330446453617356e-11,
|
|
"loss": 0.0,
|
|
"num_tokens": 225635250.0,
|
|
"reward": 0.9921875,
|
|
"reward_std": 0.022097086533904076,
|
|
"rewards/drgrpo_math_reward/mean": 0.9921875,
|
|
"rewards/drgrpo_math_reward/std": 0.0883883461356163,
|
|
"step": 2783
|
|
},
|
|
{
|
|
"advantages/mean": -2.7939677238464355e-09,
|
|
"advantages/snr": 8.450271315871733e-09,
|
|
"advantages/std": 0.3306364417076111,
|
|
"advantages/var": 0.1093204565850705,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"epoch": 7.997849462365592,
|
|
"grad_norm": 0.10108201830407647,
|
|
"learning_rate": 5.127671906690612e-11,
|
|
"loss": 0.0,
|
|
"num_tokens": 225718466.0,
|
|
"reward": 0.9140625,
|
|
"reward_std": 0.05102896690368652,
|
|
"rewards/drgrpo_math_reward/mean": 0.9140625,
|
|
"rewards/drgrpo_math_reward/std": 0.2813730239868164,
|
|
"step": 2784
|
|
},
|
|
{
|
|
"epoch": 7.997849462365592,
|
|
"step": 2784,
|
|
"total_flos": 0.0,
|
|
"train_loss": 1.1265189547477097e-09,
|
|
"train_runtime": 46427.9339,
|
|
"train_samples_per_second": 0.961,
|
|
"train_steps_per_second": 0.06
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 2792,
|
|
"num_input_tokens_seen": 225718466,
|
|
"num_train_epochs": 8,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|