Files
DeepSeek-R1-Distill-Qwen-7B…/trainer_state.json
ModelHub XC e72c3e4abd 初始化项目,由ModelHub XC社区提供模型
Model: leonMW/DeepSeek-R1-Distill-Qwen-7B-GSPO-Basic
Source: Original Platform
2026-04-26 01:09:06 +08:00

36334 lines
1.3 MiB

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11232.0,
"completions/mean_length": 2382.95703125,
"completions/mean_terminated_length": 1960.3902587890625,
"completions/min_length": 362.0,
"completions/min_terminated_length": 362.0,
"entropy": 0.2613159120082855,
"epoch": 0.002631578947368421,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0321967713534832,
"learning_rate": 1e-06,
"loss": 0.0443,
"num_tokens": 1623882.0,
"reward": 0.319697767496109,
"reward_std": 0.3174176812171936,
"rewards/progression_diversity/mean": -0.0028816750273108482,
"rewards/progression_diversity/std": 0.03537697717547417,
"rewards/symbolic_reward_accuracy/mean": 0.265625,
"rewards/symbolic_reward_accuracy/std": 0.44209739565849304,
"rewards/symbolic_reward_partial_score/mean": 0.5423176884651184,
"rewards/symbolic_reward_partial_score/std": 0.333414226770401,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0534241199493408,
"sampling/importance_sampling_ratio/min": 2.7432516327974277e-11,
"sampling/sampling_logp_difference/max": 24.319292068481445,
"sampling/sampling_logp_difference/mean": 0.10465320944786072,
"step": 1
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2668819725513458,
"epoch": 0.005263157894736842,
"grad_norm": 0.02215813286602497,
"learning_rate": 1e-06,
"loss": 0.0095,
"step": 2
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.25,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.25824375450611115,
"epoch": 0.007894736842105263,
"grad_norm": 0.02271086722612381,
"learning_rate": 1e-06,
"loss": 0.0381,
"step": 3
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.25850287079811096,
"epoch": 0.010526315789473684,
"grad_norm": 0.030293075367808342,
"learning_rate": 1e-06,
"loss": 0.0528,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13193.0,
"completions/mean_length": 2588.431640625,
"completions/mean_terminated_length": 2027.6361083984375,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"entropy": 0.2614176720380783,
"epoch": 0.013157894736842105,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.030042776837944984,
"learning_rate": 1e-06,
"loss": 0.0258,
"num_tokens": 3343623.0,
"reward": 0.38939642906188965,
"reward_std": 0.3555682599544525,
"rewards/progression_diversity/mean": -0.0007864796789363027,
"rewards/progression_diversity/std": 0.008491357788443565,
"rewards/symbolic_reward_accuracy/mean": 0.35546875,
"rewards/symbolic_reward_accuracy/std": 0.47912323474884033,
"rewards/symbolic_reward_partial_score/mean": 0.5968424081802368,
"rewards/symbolic_reward_partial_score/std": 0.35041725635528564,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.051404595375061,
"sampling/importance_sampling_ratio/min": 0.00030985596822574735,
"sampling/sampling_logp_difference/max": 8.079402923583984,
"sampling/sampling_logp_difference/mean": 0.09999503940343857,
"step": 5
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.28125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2622606158256531,
"epoch": 0.015789473684210527,
"grad_norm": 0.02478160709142685,
"learning_rate": 1e-06,
"loss": 0.0686,
"step": 6
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.28125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2552778720855713,
"epoch": 0.018421052631578946,
"grad_norm": 0.02954026311635971,
"learning_rate": 1e-06,
"loss": 0.0668,
"step": 7
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.2109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.25751645863056183,
"epoch": 0.021052631578947368,
"grad_norm": 0.026725415140390396,
"learning_rate": 1e-06,
"loss": 0.0219,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12436.0,
"completions/mean_length": 2186.072265625,
"completions/mean_terminated_length": 1816.1864013671875,
"completions/min_length": 362.0,
"completions/min_terminated_length": 362.0,
"entropy": 0.2712424546480179,
"epoch": 0.02368421052631579,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.032029978930950165,
"learning_rate": 1e-06,
"loss": 0.0221,
"num_tokens": 4856012.0,
"reward": 0.4238227605819702,
"reward_std": 0.36550843715667725,
"rewards/progression_diversity/mean": -0.0005370433209463954,
"rewards/progression_diversity/std": 0.007207411807030439,
"rewards/symbolic_reward_accuracy/mean": 0.3984375,
"rewards/symbolic_reward_accuracy/std": 0.4900552034378052,
"rewards/symbolic_reward_partial_score/mean": 0.6223958134651184,
"rewards/symbolic_reward_partial_score/std": 0.3536975085735321,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.055936574935913,
"sampling/importance_sampling_ratio/min": 0.00310147344134748,
"sampling/sampling_logp_difference/max": 5.775877952575684,
"sampling/sampling_logp_difference/mean": 0.10886339843273163,
"step": 9
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.2743559181690216,
"epoch": 0.02631578947368421,
"grad_norm": 0.021439263597130775,
"learning_rate": 1e-06,
"loss": 0.032,
"step": 10
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.2734375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.2593652904033661,
"epoch": 0.02894736842105263,
"grad_norm": 0.030393775552511215,
"learning_rate": 1e-06,
"loss": 0.0474,
"step": 11
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.265625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.2632688581943512,
"epoch": 0.031578947368421054,
"grad_norm": 0.023602856323122978,
"learning_rate": 1e-06,
"loss": 0.0239,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13597.0,
"completions/mean_length": 2401.51953125,
"completions/mean_terminated_length": 2094.51904296875,
"completions/min_length": 366.0,
"completions/min_terminated_length": 366.0,
"entropy": 0.2705395370721817,
"epoch": 0.034210526315789476,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.027274658903479576,
"learning_rate": 1e-06,
"loss": 0.0171,
"num_tokens": 6479734.0,
"reward": 0.4843239188194275,
"reward_std": 0.356700599193573,
"rewards/progression_diversity/mean": -0.00022850481036584824,
"rewards/progression_diversity/std": 0.0038324107881635427,
"rewards/symbolic_reward_accuracy/mean": 0.482421875,
"rewards/symbolic_reward_accuracy/std": 0.5001795887947083,
"rewards/symbolic_reward_partial_score/mean": 0.6554361581802368,
"rewards/symbolic_reward_partial_score/std": 0.37485411763191223,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0567376613616943,
"sampling/importance_sampling_ratio/min": 0.004240815062075853,
"sampling/sampling_logp_difference/max": 5.4629998207092285,
"sampling/sampling_logp_difference/mean": 0.11016078293323517,
"step": 13
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2603074461221695,
"epoch": 0.03684210526315789,
"grad_norm": 0.03649430721998215,
"learning_rate": 1e-06,
"loss": 0.0331,
"step": 14
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.28125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.2602958679199219,
"epoch": 0.039473684210526314,
"grad_norm": 0.019655603915452957,
"learning_rate": 1e-06,
"loss": 0.0592,
"step": 15
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.28125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.2742217630147934,
"epoch": 0.042105263157894736,
"grad_norm": 0.019776981323957443,
"learning_rate": 1e-06,
"loss": 0.0364,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13346.0,
"completions/mean_length": 2445.453125,
"completions/mean_terminated_length": 1966.755615234375,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"entropy": 0.2584807947278023,
"epoch": 0.04473684210526316,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.04284660518169403,
"learning_rate": 1e-06,
"loss": 0.0521,
"num_tokens": 8138942.0,
"reward": 0.5522767305374146,
"reward_std": 0.32186436653137207,
"rewards/progression_diversity/mean": -0.00181739148683846,
"rewards/progression_diversity/std": 0.018342694267630577,
"rewards/symbolic_reward_accuracy/mean": 0.568359375,
"rewards/symbolic_reward_accuracy/std": 0.4957893490791321,
"rewards/symbolic_reward_partial_score/mean": 0.71337890625,
"rewards/symbolic_reward_partial_score/std": 0.37059590220451355,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.053914189338684,
"sampling/importance_sampling_ratio/min": 0.004635987337678671,
"sampling/sampling_logp_difference/max": 5.373906135559082,
"sampling/sampling_logp_difference/mean": 0.10624644160270691,
"step": 17
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.26499390602111816,
"epoch": 0.04736842105263158,
"grad_norm": 0.02529755048453808,
"learning_rate": 1e-06,
"loss": 0.038,
"step": 18
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.26124098896980286,
"epoch": 0.05,
"grad_norm": 0.027479395270347595,
"learning_rate": 1e-06,
"loss": 0.0462,
"step": 19
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2685445100069046,
"epoch": 0.05263157894736842,
"grad_norm": 0.02501635067164898,
"learning_rate": 1e-06,
"loss": 0.0469,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14541.0,
"completions/mean_length": 2491.662109375,
"completions/mean_terminated_length": 2101.1142578125,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"entropy": 0.25292401015758514,
"epoch": 0.05526315789473684,
"frac_reward_zero_std": 0.09375,
"grad_norm": 0.03069983422756195,
"learning_rate": 1e-06,
"loss": 0.0885,
"num_tokens": 9818961.0,
"reward": 0.5678541660308838,
"reward_std": 0.30515754222869873,
"rewards/progression_diversity/mean": -0.0016935247695073485,
"rewards/progression_diversity/std": 0.024840721860527992,
"rewards/symbolic_reward_accuracy/mean": 0.591796875,
"rewards/symbolic_reward_accuracy/std": 0.49198177456855774,
"rewards/symbolic_reward_partial_score/mean": 0.7164713144302368,
"rewards/symbolic_reward_partial_score/std": 0.38369399309158325,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0556615591049194,
"sampling/importance_sampling_ratio/min": 3.9937659049577425e-16,
"sampling/sampling_logp_difference/max": 35.456626892089844,
"sampling/sampling_logp_difference/mean": 0.10836475342512131,
"step": 21
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.2719355523586273,
"epoch": 0.05789473684210526,
"grad_norm": 0.02567470446228981,
"learning_rate": 1e-06,
"loss": 0.0034,
"step": 22
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.2421875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2798517644405365,
"epoch": 0.060526315789473685,
"grad_norm": 0.01759127527475357,
"learning_rate": 1e-06,
"loss": 0.0207,
"step": 23
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.26892197132110596,
"epoch": 0.06315789473684211,
"grad_norm": 0.029061393812298775,
"learning_rate": 1e-06,
"loss": 0.0581,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15759.0,
"completions/mean_length": 2611.63671875,
"completions/mean_terminated_length": 2167.366943359375,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"entropy": 0.2623217850923538,
"epoch": 0.06578947368421052,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04489253833889961,
"learning_rate": 1e-06,
"loss": 0.0677,
"num_tokens": 11567607.0,
"reward": 0.6099430322647095,
"reward_std": 0.3489864468574524,
"rewards/progression_diversity/mean": -0.0017921316903084517,
"rewards/progression_diversity/std": 0.023787712678313255,
"rewards/symbolic_reward_accuracy/mean": 0.642578125,
"rewards/symbolic_reward_accuracy/std": 0.4797092080116272,
"rewards/symbolic_reward_partial_score/mean": 0.755859375,
"rewards/symbolic_reward_partial_score/std": 0.3704371750354767,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0542800426483154,
"sampling/importance_sampling_ratio/min": 0.00033611588878557086,
"sampling/sampling_logp_difference/max": 7.998054504394531,
"sampling/sampling_logp_difference/mean": 0.10523411631584167,
"step": 25
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.2667793333530426,
"epoch": 0.06842105263157895,
"grad_norm": 0.042507365345954895,
"learning_rate": 1e-06,
"loss": 0.042,
"step": 26
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.2573726028203964,
"epoch": 0.07105263157894737,
"grad_norm": 0.03612668439745903,
"learning_rate": 1e-06,
"loss": 0.0948,
"step": 27
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.2717868983745575,
"epoch": 0.07368421052631578,
"grad_norm": 0.03351793438196182,
"learning_rate": 1e-06,
"loss": 0.0522,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15168.0,
"completions/mean_length": 2566.255859375,
"completions/mean_terminated_length": 2062.775390625,
"completions/min_length": 327.0,
"completions/min_terminated_length": 327.0,
"entropy": 0.2697293758392334,
"epoch": 0.07631578947368421,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.036783039569854736,
"learning_rate": 1e-06,
"loss": 0.0337,
"num_tokens": 13282330.0,
"reward": 0.6279209852218628,
"reward_std": 0.30352213978767395,
"rewards/progression_diversity/mean": -0.0008747372776269913,
"rewards/progression_diversity/std": 0.01144934818148613,
"rewards/symbolic_reward_accuracy/mean": 0.666015625,
"rewards/symbolic_reward_accuracy/std": 0.47209542989730835,
"rewards/symbolic_reward_partial_score/mean": 0.771484375,
"rewards/symbolic_reward_partial_score/std": 0.36460402607917786,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0549585819244385,
"sampling/importance_sampling_ratio/min": 7.974162144819275e-05,
"sampling/sampling_logp_difference/max": 9.436718940734863,
"sampling/sampling_logp_difference/mean": 0.10604645311832428,
"step": 29
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.2652689218521118,
"epoch": 0.07894736842105263,
"grad_norm": 0.031599096953868866,
"learning_rate": 1e-06,
"loss": 0.0545,
"step": 30
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.2544643208384514,
"epoch": 0.08157894736842106,
"grad_norm": 0.03158700093626976,
"learning_rate": 1e-06,
"loss": 0.1139,
"step": 31
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.27170825004577637,
"epoch": 0.08421052631578947,
"grad_norm": 0.026205774396657944,
"learning_rate": 1e-06,
"loss": 0.0488,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12587.0,
"completions/mean_length": 2206.931640625,
"completions/mean_terminated_length": 1895.65869140625,
"completions/min_length": 257.0,
"completions/min_terminated_length": 257.0,
"entropy": 0.26374557614326477,
"epoch": 0.0868421052631579,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.02841874025762081,
"learning_rate": 1e-06,
"loss": 0.0425,
"num_tokens": 14809751.0,
"reward": 0.6377917528152466,
"reward_std": 0.28174924850463867,
"rewards/progression_diversity/mean": -0.00011936978262383491,
"rewards/progression_diversity/std": 0.0016766022890806198,
"rewards/symbolic_reward_accuracy/mean": 0.677734375,
"rewards/symbolic_reward_accuracy/std": 0.46780112385749817,
"rewards/symbolic_reward_partial_score/mean": 0.7744140625,
"rewards/symbolic_reward_partial_score/std": 0.37388041615486145,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0545893907546997,
"sampling/importance_sampling_ratio/min": 0.0025524995289742947,
"sampling/sampling_logp_difference/max": 5.970682144165039,
"sampling/sampling_logp_difference/mean": 0.1066381186246872,
"step": 33
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2746908664703369,
"epoch": 0.08947368421052632,
"grad_norm": 0.033538322895765305,
"learning_rate": 1e-06,
"loss": 0.0432,
"step": 34
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2674672603607178,
"epoch": 0.09210526315789473,
"grad_norm": 0.02931833639740944,
"learning_rate": 1e-06,
"loss": 0.0377,
"step": 35
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.2641746252775192,
"epoch": 0.09473684210526316,
"grad_norm": 0.038021378219127655,
"learning_rate": 1e-06,
"loss": 0.0341,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11568.0,
"completions/mean_length": 2400.705078125,
"completions/mean_terminated_length": 1949.6309814453125,
"completions/min_length": 328.0,
"completions/min_terminated_length": 328.0,
"entropy": 0.27227024734020233,
"epoch": 0.09736842105263158,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.024732865393161774,
"learning_rate": 1e-06,
"loss": 0.0088,
"num_tokens": 16416512.0,
"reward": 0.6867997050285339,
"reward_std": 0.2583235502243042,
"rewards/progression_diversity/mean": -0.0016752530355006456,
"rewards/progression_diversity/std": 0.021188421174883842,
"rewards/symbolic_reward_accuracy/mean": 0.744140625,
"rewards/symbolic_reward_accuracy/std": 0.43676990270614624,
"rewards/symbolic_reward_partial_score/mean": 0.8095703125,
"rewards/symbolic_reward_partial_score/std": 0.3585224747657776,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0554531812667847,
"sampling/importance_sampling_ratio/min": 1.577901821292471e-05,
"sampling/sampling_logp_difference/max": 11.056829452514648,
"sampling/sampling_logp_difference/mean": 0.10788406431674957,
"step": 37
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.27088363468647003,
"epoch": 0.1,
"grad_norm": 0.0261248666793108,
"learning_rate": 1e-06,
"loss": 0.0765,
"step": 38
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.26246321201324463,
"epoch": 0.10263157894736842,
"grad_norm": 0.019391866400837898,
"learning_rate": 1e-06,
"loss": 0.0742,
"step": 39
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2687445729970932,
"epoch": 0.10526315789473684,
"grad_norm": 0.02644912153482437,
"learning_rate": 1e-06,
"loss": 0.0445,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14635.0,
"completions/mean_length": 2510.658203125,
"completions/mean_terminated_length": 2091.945556640625,
"completions/min_length": 345.0,
"completions/min_terminated_length": 345.0,
"entropy": 0.26613570749759674,
"epoch": 0.10789473684210527,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.0331517830491066,
"learning_rate": 1e-06,
"loss": 0.0651,
"num_tokens": 18096273.0,
"reward": 0.6728851199150085,
"reward_std": 0.25044363737106323,
"rewards/progression_diversity/mean": -0.0015264635439962149,
"rewards/progression_diversity/std": 0.01903173141181469,
"rewards/symbolic_reward_accuracy/mean": 0.72265625,
"rewards/symbolic_reward_accuracy/std": 0.4481254518032074,
"rewards/symbolic_reward_partial_score/mean": 0.8055012822151184,
"rewards/symbolic_reward_partial_score/std": 0.3548940122127533,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.055107831954956,
"sampling/importance_sampling_ratio/min": 9.60548914008541e-06,
"sampling/sampling_logp_difference/max": 11.553175926208496,
"sampling/sampling_logp_difference/mean": 0.10785190761089325,
"step": 41
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.26286329329013824,
"epoch": 0.11052631578947368,
"grad_norm": 0.029875140637159348,
"learning_rate": 1e-06,
"loss": 0.0671,
"step": 42
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.27232979238033295,
"epoch": 0.11315789473684211,
"grad_norm": 0.026524005457758904,
"learning_rate": 1e-06,
"loss": 0.0335,
"step": 43
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.26245296001434326,
"epoch": 0.11578947368421053,
"grad_norm": 0.034007180482149124,
"learning_rate": 1e-06,
"loss": 0.0868,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10722.0,
"completions/mean_length": 2233.7578125,
"completions/mean_terminated_length": 1894.152099609375,
"completions/min_length": 327.0,
"completions/min_terminated_length": 327.0,
"entropy": 0.2722453474998474,
"epoch": 0.11842105263157894,
"frac_reward_zero_std": 0.09375,
"grad_norm": 0.057794682681560516,
"learning_rate": 1e-06,
"loss": 0.0629,
"num_tokens": 19646613.0,
"reward": 0.6934746503829956,
"reward_std": 0.30955323576927185,
"rewards/progression_diversity/mean": -0.0031212307512760162,
"rewards/progression_diversity/std": 0.02668512612581253,
"rewards/symbolic_reward_accuracy/mean": 0.75,
"rewards/symbolic_reward_accuracy/std": 0.43343618512153625,
"rewards/symbolic_reward_partial_score/mean": 0.8194986581802368,
"rewards/symbolic_reward_partial_score/std": 0.3468964397907257,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0547754764556885,
"sampling/importance_sampling_ratio/min": 0.002088946523144841,
"sampling/sampling_logp_difference/max": 6.171095371246338,
"sampling/sampling_logp_difference/mean": 0.10662943124771118,
"step": 45
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.2671891450881958,
"epoch": 0.12105263157894737,
"grad_norm": 0.022444499656558037,
"learning_rate": 1e-06,
"loss": 0.0136,
"step": 46
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.25909220427274704,
"epoch": 0.12368421052631579,
"grad_norm": 0.023088127374649048,
"learning_rate": 1e-06,
"loss": 0.0754,
"step": 47
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.26286621391773224,
"epoch": 0.12631578947368421,
"grad_norm": 0.026823926717042923,
"learning_rate": 1e-06,
"loss": 0.0538,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11726.0,
"completions/mean_length": 2254.619140625,
"completions/mean_terminated_length": 1769.3677978515625,
"completions/min_length": 341.0,
"completions/min_terminated_length": 341.0,
"entropy": 0.2699515223503113,
"epoch": 0.12894736842105264,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.02539858967065811,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 21198770.0,
"reward": 0.674487829208374,
"reward_std": 0.2851996421813965,
"rewards/progression_diversity/mean": -0.0023922312539070845,
"rewards/progression_diversity/std": 0.024297581985592842,
"rewards/symbolic_reward_accuracy/mean": 0.728515625,
"rewards/symbolic_reward_accuracy/std": 0.44516023993492126,
"rewards/symbolic_reward_partial_score/mean": 0.8011067509651184,
"rewards/symbolic_reward_partial_score/std": 0.36674949526786804,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0519602298736572,
"sampling/importance_sampling_ratio/min": 5.275764829005907e-16,
"sampling/sampling_logp_difference/max": 35.17823791503906,
"sampling/sampling_logp_difference/mean": 0.1013394445180893,
"step": 49
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.25432244688272476,
"epoch": 0.13157894736842105,
"grad_norm": 0.021180639043450356,
"learning_rate": 1e-06,
"loss": 0.0514,
"step": 50
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2609899342060089,
"epoch": 0.13421052631578947,
"grad_norm": 0.021488042548298836,
"learning_rate": 1e-06,
"loss": 0.097,
"step": 51
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.24922076612710953,
"epoch": 0.1368421052631579,
"grad_norm": 0.027309391647577286,
"learning_rate": 1e-06,
"loss": 0.1207,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16107.0,
"completions/mean_length": 2299.81640625,
"completions/mean_terminated_length": 1845.48779296875,
"completions/min_length": 310.0,
"completions/min_terminated_length": 310.0,
"entropy": 0.26104749739170074,
"epoch": 0.1394736842105263,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.028313223272562027,
"learning_rate": 1e-06,
"loss": 0.0445,
"num_tokens": 22783412.0,
"reward": 0.6890961527824402,
"reward_std": 0.2640804052352905,
"rewards/progression_diversity/mean": -0.0015217037871479988,
"rewards/progression_diversity/std": 0.01678471639752388,
"rewards/symbolic_reward_accuracy/mean": 0.740234375,
"rewards/symbolic_reward_accuracy/std": 0.4389347732067108,
"rewards/symbolic_reward_partial_score/mean": 0.82568359375,
"rewards/symbolic_reward_partial_score/std": 0.3356630504131317,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.052910566329956,
"sampling/importance_sampling_ratio/min": 7.863413884479087e-07,
"sampling/sampling_logp_difference/max": 14.055874824523926,
"sampling/sampling_logp_difference/mean": 0.10275271534919739,
"step": 53
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.26632043719291687,
"epoch": 0.14210526315789473,
"grad_norm": 0.029039481654763222,
"learning_rate": 1e-06,
"loss": 0.0791,
"step": 54
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2686396688222885,
"epoch": 0.14473684210526316,
"grad_norm": 0.028012612834572792,
"learning_rate": 1e-06,
"loss": 0.0624,
"step": 55
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.25890131294727325,
"epoch": 0.14736842105263157,
"grad_norm": 0.025674991309642792,
"learning_rate": 1e-06,
"loss": 0.086,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12780.0,
"completions/mean_length": 2413.14453125,
"completions/mean_terminated_length": 1874.7139892578125,
"completions/min_length": 363.0,
"completions/min_terminated_length": 363.0,
"entropy": 0.25308099389076233,
"epoch": 0.15,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.04013952985405922,
"learning_rate": 1e-06,
"loss": 0.0407,
"num_tokens": 24430270.0,
"reward": 0.6752703189849854,
"reward_std": 0.308138370513916,
"rewards/progression_diversity/mean": -0.002267459873110056,
"rewards/progression_diversity/std": 0.02433175779879093,
"rewards/symbolic_reward_accuracy/mean": 0.7265625,
"rewards/symbolic_reward_accuracy/std": 0.4461594223976135,
"rewards/symbolic_reward_partial_score/mean": 0.8076171875,
"rewards/symbolic_reward_partial_score/std": 0.35207507014274597,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0503523349761963,
"sampling/importance_sampling_ratio/min": 7.612612193952373e-07,
"sampling/sampling_logp_difference/max": 14.088289260864258,
"sampling/sampling_logp_difference/mean": 0.09810857474803925,
"step": 57
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2614179700613022,
"epoch": 0.15263157894736842,
"grad_norm": 0.03701889514923096,
"learning_rate": 1e-06,
"loss": 0.0238,
"step": 58
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.24882755428552628,
"epoch": 0.15526315789473685,
"grad_norm": 0.0272480770945549,
"learning_rate": 1e-06,
"loss": 0.1082,
"step": 59
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.25699031352996826,
"epoch": 0.15789473684210525,
"grad_norm": 0.03054182417690754,
"learning_rate": 1e-06,
"loss": 0.0537,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16104.0,
"completions/mean_length": 2519.41796875,
"completions/mean_terminated_length": 1955.8170166015625,
"completions/min_length": 349.0,
"completions/min_terminated_length": 349.0,
"entropy": 0.2684636861085892,
"epoch": 0.16052631578947368,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.028775321319699287,
"learning_rate": 1e-06,
"loss": 0.0122,
"num_tokens": 26120852.0,
"reward": 0.7042874693870544,
"reward_std": 0.26989448070526123,
"rewards/progression_diversity/mean": -0.0009394375374540687,
"rewards/progression_diversity/std": 0.01054183766245842,
"rewards/symbolic_reward_accuracy/mean": 0.767578125,
"rewards/symbolic_reward_accuracy/std": 0.42278963327407837,
"rewards/symbolic_reward_partial_score/mean": 0.8235676884651184,
"rewards/symbolic_reward_partial_score/std": 0.3544165790081024,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0519888401031494,
"sampling/importance_sampling_ratio/min": 0.0002099307457683608,
"sampling/sampling_logp_difference/max": 8.468732833862305,
"sampling/sampling_logp_difference/mean": 0.10232645273208618,
"step": 61
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.25643981993198395,
"epoch": 0.1631578947368421,
"grad_norm": 0.03552790358662605,
"learning_rate": 1e-06,
"loss": 0.1004,
"step": 62
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.2648386210203171,
"epoch": 0.16578947368421051,
"grad_norm": 0.032785214483737946,
"learning_rate": 1e-06,
"loss": 0.065,
"step": 63
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.25679877400398254,
"epoch": 0.16842105263157894,
"grad_norm": 0.027879441156983376,
"learning_rate": 1e-06,
"loss": 0.0989,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.060546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14055.0,
"completions/mean_length": 3021.4609375,
"completions/mean_terminated_length": 2160.2578125,
"completions/min_length": 387.0,
"completions/min_terminated_length": 387.0,
"entropy": 0.2547323405742645,
"epoch": 0.17105263157894737,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.036476247012615204,
"learning_rate": 1e-06,
"loss": 0.0497,
"num_tokens": 28101184.0,
"reward": 0.6189697980880737,
"reward_std": 0.3368384838104248,
"rewards/progression_diversity/mean": -0.002434882801026106,
"rewards/progression_diversity/std": 0.020372329279780388,
"rewards/symbolic_reward_accuracy/mean": 0.66796875,
"rewards/symbolic_reward_accuracy/std": 0.47140273451805115,
"rewards/symbolic_reward_partial_score/mean": 0.7449544072151184,
"rewards/symbolic_reward_partial_score/std": 0.4009448289871216,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.049661636352539,
"sampling/importance_sampling_ratio/min": 8.808132175019967e-13,
"sampling/sampling_logp_difference/max": 27.757930755615234,
"sampling/sampling_logp_difference/mean": 0.0967579260468483,
"step": 65
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.24418476969003677,
"epoch": 0.1736842105263158,
"grad_norm": 0.02933136560022831,
"learning_rate": 1e-06,
"loss": 0.0656,
"step": 66
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.2377062812447548,
"epoch": 0.1763157894736842,
"grad_norm": 0.028272006660699844,
"learning_rate": 1e-06,
"loss": 0.1092,
"step": 67
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.251356840133667,
"epoch": 0.17894736842105263,
"grad_norm": 0.03293720260262489,
"learning_rate": 1e-06,
"loss": 0.0516,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11239.0,
"completions/mean_length": 2362.73046875,
"completions/mean_terminated_length": 1822.35693359375,
"completions/min_length": 332.0,
"completions/min_terminated_length": 332.0,
"entropy": 0.2604978382587433,
"epoch": 0.18157894736842106,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.031264252960681915,
"learning_rate": 1e-06,
"loss": 0.0868,
"num_tokens": 29700854.0,
"reward": 0.7232824563980103,
"reward_std": 0.27414870262145996,
"rewards/progression_diversity/mean": -0.0008577151456847787,
"rewards/progression_diversity/std": 0.009135694243013859,
"rewards/symbolic_reward_accuracy/mean": 0.783203125,
"rewards/symbolic_reward_accuracy/std": 0.4124660789966583,
"rewards/symbolic_reward_partial_score/mean": 0.85498046875,
"rewards/symbolic_reward_partial_score/std": 0.3152819871902466,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0526225566864014,
"sampling/importance_sampling_ratio/min": 1.6764071233410505e-06,
"sampling/sampling_logp_difference/max": 13.298857688903809,
"sampling/sampling_logp_difference/mean": 0.10294780135154724,
"step": 69
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.2632171958684921,
"epoch": 0.18421052631578946,
"grad_norm": 0.0390724278986454,
"learning_rate": 1e-06,
"loss": 0.0445,
"step": 70
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.2604610174894333,
"epoch": 0.1868421052631579,
"grad_norm": 0.03644363954663277,
"learning_rate": 1e-06,
"loss": 0.1041,
"step": 71
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.2630729079246521,
"epoch": 0.18947368421052632,
"grad_norm": 0.02937830239534378,
"learning_rate": 1e-06,
"loss": 0.0363,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.041015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12593.0,
"completions/mean_length": 2441.689453125,
"completions/mean_terminated_length": 1845.37890625,
"completions/min_length": 319.0,
"completions/min_terminated_length": 319.0,
"entropy": 0.2574312090873718,
"epoch": 0.19210526315789472,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.025905072689056396,
"learning_rate": 1e-06,
"loss": 0.0491,
"num_tokens": 31351639.0,
"reward": 0.724668025970459,
"reward_std": 0.24668556451797485,
"rewards/progression_diversity/mean": -0.003901800373569131,
"rewards/progression_diversity/std": 0.030434370040893555,
"rewards/symbolic_reward_accuracy/mean": 0.7890625,
"rewards/symbolic_reward_accuracy/std": 0.4083731174468994,
"rewards/symbolic_reward_partial_score/mean": 0.8499348759651184,
"rewards/symbolic_reward_partial_score/std": 0.3279423713684082,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0521684885025024,
"sampling/importance_sampling_ratio/min": 1.6198048111149e-11,
"sampling/sampling_logp_difference/max": 24.84613037109375,
"sampling/sampling_logp_difference/mean": 0.10166233777999878,
"step": 73
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2581082284450531,
"epoch": 0.19473684210526315,
"grad_norm": 0.022781765088438988,
"learning_rate": 1e-06,
"loss": 0.0576,
"step": 74
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2618667483329773,
"epoch": 0.19736842105263158,
"grad_norm": 0.02607133612036705,
"learning_rate": 1e-06,
"loss": 0.0836,
"step": 75
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.25442183017730713,
"epoch": 0.2,
"grad_norm": 0.019913366064429283,
"learning_rate": 1e-06,
"loss": 0.0734,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14265.0,
"completions/mean_length": 2636.556640625,
"completions/mean_terminated_length": 1930.8358154296875,
"completions/min_length": 358.0,
"completions/min_terminated_length": 358.0,
"entropy": 0.2596082091331482,
"epoch": 0.2026315789473684,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.02666478045284748,
"learning_rate": 1e-06,
"loss": 0.0508,
"num_tokens": 33113044.0,
"reward": 0.6970432996749878,
"reward_std": 0.27402693033218384,
"rewards/progression_diversity/mean": -0.0027059155981987715,
"rewards/progression_diversity/std": 0.023960862308740616,
"rewards/symbolic_reward_accuracy/mean": 0.75,
"rewards/symbolic_reward_accuracy/std": 0.43343618512153625,
"rewards/symbolic_reward_partial_score/mean": 0.837890625,
"rewards/symbolic_reward_partial_score/std": 0.3269626796245575,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0499635934829712,
"sampling/importance_sampling_ratio/min": 5.463769866764778e-06,
"sampling/sampling_logp_difference/max": 12.117371559143066,
"sampling/sampling_logp_difference/mean": 0.09771312028169632,
"step": 77
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2613191306591034,
"epoch": 0.20526315789473684,
"grad_norm": 0.025247380137443542,
"learning_rate": 1e-06,
"loss": 0.0371,
"step": 78
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2529866099357605,
"epoch": 0.20789473684210527,
"grad_norm": 0.030076855793595314,
"learning_rate": 1e-06,
"loss": 0.0572,
"step": 79
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.24038981646299362,
"epoch": 0.21052631578947367,
"grad_norm": 0.0339297391474247,
"learning_rate": 1e-06,
"loss": 0.1197,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8195.0,
"completions/mean_length": 2479.08984375,
"completions/mean_terminated_length": 1765.2855224609375,
"completions/min_length": 344.0,
"completions/min_terminated_length": 344.0,
"entropy": 0.2412530928850174,
"epoch": 0.2131578947368421,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.02495790272951126,
"learning_rate": 1e-06,
"loss": 0.0973,
"num_tokens": 34795330.0,
"reward": 0.7531094551086426,
"reward_std": 0.24608904123306274,
"rewards/progression_diversity/mean": -0.0015581449260935187,
"rewards/progression_diversity/std": 0.013487322255969048,
"rewards/symbolic_reward_accuracy/mean": 0.82421875,
"rewards/symbolic_reward_accuracy/std": 0.3810062110424042,
"rewards/symbolic_reward_partial_score/mean": 0.8756510019302368,
"rewards/symbolic_reward_partial_score/std": 0.30089128017425537,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0499427318572998,
"sampling/importance_sampling_ratio/min": 4.466129244207195e-09,
"sampling/sampling_logp_difference/max": 19.226743698120117,
"sampling/sampling_logp_difference/mean": 0.09754176437854767,
"step": 81
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.2646462917327881,
"epoch": 0.21578947368421053,
"grad_norm": 0.02527514286339283,
"learning_rate": 1e-06,
"loss": 0.051,
"step": 82
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2567380368709564,
"epoch": 0.21842105263157896,
"grad_norm": 0.032149430364370346,
"learning_rate": 1e-06,
"loss": 0.0786,
"step": 83
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.25139138102531433,
"epoch": 0.22105263157894736,
"grad_norm": 0.03584575280547142,
"learning_rate": 1e-06,
"loss": 0.078,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14563.0,
"completions/mean_length": 2363.416015625,
"completions/mean_terminated_length": 1793.4735107421875,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"entropy": 0.2689914107322693,
"epoch": 0.2236842105263158,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.028427930548787117,
"learning_rate": 1e-06,
"loss": 0.0551,
"num_tokens": 36393687.0,
"reward": 0.7451353073120117,
"reward_std": 0.23895391821861267,
"rewards/progression_diversity/mean": -0.003074061591178179,
"rewards/progression_diversity/std": 0.030696000903844833,
"rewards/symbolic_reward_accuracy/mean": 0.8125,
"rewards/symbolic_reward_accuracy/std": 0.39069411158561707,
"rewards/symbolic_reward_partial_score/mean": 0.87060546875,
"rewards/symbolic_reward_partial_score/std": 0.3043627142906189,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0520421266555786,
"sampling/importance_sampling_ratio/min": 0.0007652377826161683,
"sampling/sampling_logp_difference/max": 7.175323963165283,
"sampling/sampling_logp_difference/mean": 0.10205866396427155,
"step": 85
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.26435205340385437,
"epoch": 0.22631578947368422,
"grad_norm": 0.02536817453801632,
"learning_rate": 1e-06,
"loss": 0.0536,
"step": 86
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.26483944058418274,
"epoch": 0.22894736842105262,
"grad_norm": 0.029015418142080307,
"learning_rate": 1e-06,
"loss": 0.0774,
"step": 87
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.24875866621732712,
"epoch": 0.23157894736842105,
"grad_norm": 0.04244063422083855,
"learning_rate": 1e-06,
"loss": 0.1163,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13828.0,
"completions/mean_length": 2104.162109375,
"completions/mean_terminated_length": 1673.1810302734375,
"completions/min_length": 330.0,
"completions/min_terminated_length": 330.0,
"entropy": 0.26499253511428833,
"epoch": 0.23421052631578948,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.018304875120520592,
"learning_rate": 1e-06,
"loss": 0.0338,
"num_tokens": 37858282.0,
"reward": 0.785420298576355,
"reward_std": 0.17559418082237244,
"rewards/progression_diversity/mean": -0.002899130806326866,
"rewards/progression_diversity/std": 0.025376563891768456,
"rewards/symbolic_reward_accuracy/mean": 0.857421875,
"rewards/symbolic_reward_accuracy/std": 0.3499840497970581,
"rewards/symbolic_reward_partial_score/mean": 0.9124348759651184,
"rewards/symbolic_reward_partial_score/std": 0.25231799483299255,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.052797794342041,
"sampling/importance_sampling_ratio/min": 3.9022021169898835e-09,
"sampling/sampling_logp_difference/max": 19.361724853515625,
"sampling/sampling_logp_difference/mean": 0.10318418592214584,
"step": 89
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.2627365291118622,
"epoch": 0.23684210526315788,
"grad_norm": 0.012172169052064419,
"learning_rate": 1e-06,
"loss": 0.0412,
"step": 90
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.2614479660987854,
"epoch": 0.2394736842105263,
"grad_norm": 0.033741675317287445,
"learning_rate": 1e-06,
"loss": 0.1004,
"step": 91
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.26432569324970245,
"epoch": 0.24210526315789474,
"grad_norm": 0.024774247780442238,
"learning_rate": 1e-06,
"loss": 0.0617,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12066.0,
"completions/mean_length": 2051.4296875,
"completions/mean_terminated_length": 1589.088623046875,
"completions/min_length": 341.0,
"completions/min_terminated_length": 341.0,
"entropy": 0.2490312159061432,
"epoch": 0.24473684210526317,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.035675231367349625,
"learning_rate": 1e-06,
"loss": 0.0635,
"num_tokens": 39296742.0,
"reward": 0.7815226316452026,
"reward_std": 0.21832206845283508,
"rewards/progression_diversity/mean": -0.002041286788880825,
"rewards/progression_diversity/std": 0.023288603872060776,
"rewards/symbolic_reward_accuracy/mean": 0.857421875,
"rewards/symbolic_reward_accuracy/std": 0.3499840497970581,
"rewards/symbolic_reward_partial_score/mean": 0.9000650644302368,
"rewards/symbolic_reward_partial_score/std": 0.2744283676147461,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0516581535339355,
"sampling/importance_sampling_ratio/min": 8.132886725187305e-13,
"sampling/sampling_logp_difference/max": 27.837690353393555,
"sampling/sampling_logp_difference/mean": 0.10091866552829742,
"step": 93
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2668968141078949,
"epoch": 0.24736842105263157,
"grad_norm": 0.012349041178822517,
"learning_rate": 1e-06,
"loss": 0.0034,
"step": 94
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.25852371752262115,
"epoch": 0.25,
"grad_norm": 0.030288243666291237,
"learning_rate": 1e-06,
"loss": 0.0817,
"step": 95
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.25272224843502045,
"epoch": 0.25263157894736843,
"grad_norm": 0.037656910717487335,
"learning_rate": 1e-06,
"loss": 0.0626,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12648.0,
"completions/mean_length": 1926.58984375,
"completions/mean_terminated_length": 1638.5936279296875,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.2652004808187485,
"epoch": 0.25526315789473686,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.040096741169691086,
"learning_rate": 1e-06,
"loss": 0.0186,
"num_tokens": 40685300.0,
"reward": 0.7350056171417236,
"reward_std": 0.2433113306760788,
"rewards/progression_diversity/mean": -0.00041471776785328984,
"rewards/progression_diversity/std": 0.008273917250335217,
"rewards/symbolic_reward_accuracy/mean": 0.798828125,
"rewards/symbolic_reward_accuracy/std": 0.4012683033943176,
"rewards/symbolic_reward_partial_score/mean": 0.8575846552848816,
"rewards/symbolic_reward_partial_score/std": 0.31621140241622925,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0552291870117188,
"sampling/importance_sampling_ratio/min": 8.425282566342125e-15,
"sampling/sampling_logp_difference/max": 32.40753936767578,
"sampling/sampling_logp_difference/mean": 0.10859895497560501,
"step": 97
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.26686596870422363,
"epoch": 0.2578947368421053,
"grad_norm": 0.014504051767289639,
"learning_rate": 1e-06,
"loss": 0.0306,
"step": 98
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.252104789018631,
"epoch": 0.26052631578947366,
"grad_norm": 0.025143684819340706,
"learning_rate": 1e-06,
"loss": 0.0521,
"step": 99
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.25955578684806824,
"epoch": 0.2631578947368421,
"grad_norm": 0.028664739802479744,
"learning_rate": 1e-06,
"loss": 0.0281,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11806.0,
"completions/mean_length": 2146.349609375,
"completions/mean_terminated_length": 1804.6461181640625,
"completions/min_length": 337.0,
"completions/min_terminated_length": 337.0,
"entropy": 0.25793255865573883,
"epoch": 0.2657894736842105,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.04812149703502655,
"learning_rate": 1e-06,
"loss": 0.0663,
"num_tokens": 42194055.0,
"reward": 0.7436953783035278,
"reward_std": 0.23294387757778168,
"rewards/progression_diversity/mean": -0.0005821330123580992,
"rewards/progression_diversity/std": 0.009706917218863964,
"rewards/symbolic_reward_accuracy/mean": 0.806640625,
"rewards/symbolic_reward_accuracy/std": 0.39531853795051575,
"rewards/symbolic_reward_partial_score/mean": 0.8722330331802368,
"rewards/symbolic_reward_partial_score/std": 0.2982480227947235,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0527888536453247,
"sampling/importance_sampling_ratio/min": 3.632110292528523e-06,
"sampling/sampling_logp_difference/max": 12.525696754455566,
"sampling/sampling_logp_difference/mean": 0.10401815176010132,
"step": 101
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.2468215674161911,
"epoch": 0.26842105263157895,
"grad_norm": 0.03846008703112602,
"learning_rate": 1e-06,
"loss": 0.0883,
"step": 102
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.2601003050804138,
"epoch": 0.2710526315789474,
"grad_norm": 0.017417294904589653,
"learning_rate": 1e-06,
"loss": 0.0133,
"step": 103
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.2558285742998123,
"epoch": 0.2736842105263158,
"grad_norm": 0.023298203945159912,
"learning_rate": 1e-06,
"loss": 0.0655,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14029.0,
"completions/mean_length": 1896.59375,
"completions/mean_terminated_length": 1608.0,
"completions/min_length": 379.0,
"completions/min_terminated_length": 379.0,
"entropy": 0.25534459948539734,
"epoch": 0.27631578947368424,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.03245214372873306,
"learning_rate": 1e-06,
"loss": 0.0691,
"num_tokens": 43574071.0,
"reward": 0.7646946907043457,
"reward_std": 0.20825007557868958,
"rewards/progression_diversity/mean": -0.00025633774930611253,
"rewards/progression_diversity/std": 0.0033328270073980093,
"rewards/symbolic_reward_accuracy/mean": 0.837890625,
"rewards/symbolic_reward_accuracy/std": 0.3689115643501282,
"rewards/symbolic_reward_partial_score/mean": 0.87841796875,
"rewards/symbolic_reward_partial_score/std": 0.30651092529296875,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0528056621551514,
"sampling/importance_sampling_ratio/min": 0.0034063623752444983,
"sampling/sampling_logp_difference/max": 5.68211030960083,
"sampling/sampling_logp_difference/mean": 0.10405570268630981,
"step": 105
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.25604379177093506,
"epoch": 0.2789473684210526,
"grad_norm": 0.01872321590781212,
"learning_rate": 1e-06,
"loss": 0.0386,
"step": 106
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.25383952260017395,
"epoch": 0.28157894736842104,
"grad_norm": 0.019688135012984276,
"learning_rate": 1e-06,
"loss": 0.0223,
"step": 107
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.24521780759096146,
"epoch": 0.28421052631578947,
"grad_norm": 0.032386377453804016,
"learning_rate": 1e-06,
"loss": 0.0405,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14031.0,
"completions/mean_length": 2017.2890625,
"completions/mean_terminated_length": 1731.099609375,
"completions/min_length": 313.0,
"completions/min_terminated_length": 313.0,
"entropy": 0.2612617313861847,
"epoch": 0.2868421052631579,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.0264718197286129,
"learning_rate": 1e-06,
"loss": 0.0199,
"num_tokens": 45012075.0,
"reward": 0.7735224366188049,
"reward_std": 0.20979472994804382,
"rewards/progression_diversity/mean": -0.0012756988871842623,
"rewards/progression_diversity/std": 0.01783159375190735,
"rewards/symbolic_reward_accuracy/mean": 0.84765625,
"rewards/symbolic_reward_accuracy/std": 0.35970520973205566,
"rewards/symbolic_reward_partial_score/mean": 0.8876953125,
"rewards/symbolic_reward_partial_score/std": 0.29055485129356384,
"rewards/tag_count_reward/mean": -0.013671875,
"rewards/tag_count_reward/std": 0.1162383034825325,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0521562099456787,
"sampling/importance_sampling_ratio/min": 5.862594480277039e-05,
"sampling/sampling_logp_difference/max": 9.744333267211914,
"sampling/sampling_logp_difference/mean": 0.10334809869527817,
"step": 109
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.25815196335315704,
"epoch": 0.2894736842105263,
"grad_norm": 0.021397482603788376,
"learning_rate": 1e-06,
"loss": 0.0101,
"step": 110
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.2469303384423256,
"epoch": 0.29210526315789476,
"grad_norm": 0.028907410800457,
"learning_rate": 1e-06,
"loss": 0.1008,
"step": 111
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.24571603536605835,
"epoch": 0.29473684210526313,
"grad_norm": 0.043796975165605545,
"learning_rate": 1e-06,
"loss": 0.0896,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12841.0,
"completions/mean_length": 1870.52734375,
"completions/mean_terminated_length": 1581.4144287109375,
"completions/min_length": 321.0,
"completions/min_terminated_length": 321.0,
"entropy": 0.2544310688972473,
"epoch": 0.29736842105263156,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.02333790808916092,
"learning_rate": 1e-06,
"loss": 0.0079,
"num_tokens": 46372409.0,
"reward": 0.7861639857292175,
"reward_std": 0.2023507058620453,
"rewards/progression_diversity/mean": -0.0017700331518426538,
"rewards/progression_diversity/std": 0.019489416852593422,
"rewards/symbolic_reward_accuracy/mean": 0.865234375,
"rewards/symbolic_reward_accuracy/std": 0.3418070077896118,
"rewards/symbolic_reward_partial_score/mean": 0.8953450322151184,
"rewards/symbolic_reward_partial_score/std": 0.28922733664512634,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0533685684204102,
"sampling/importance_sampling_ratio/min": 2.793473868223373e-05,
"sampling/sampling_logp_difference/max": 10.485639572143555,
"sampling/sampling_logp_difference/mean": 0.10594035685062408,
"step": 113
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2571485936641693,
"epoch": 0.3,
"grad_norm": 0.033841848373413086,
"learning_rate": 1e-06,
"loss": 0.0585,
"step": 114
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.24314817041158676,
"epoch": 0.3026315789473684,
"grad_norm": 0.022232208400964737,
"learning_rate": 1e-06,
"loss": 0.1467,
"step": 115
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.26761798560619354,
"epoch": 0.30526315789473685,
"grad_norm": 0.012662280350923538,
"learning_rate": 1e-06,
"loss": 0.0325,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12353.0,
"completions/mean_length": 1801.455078125,
"completions/mean_terminated_length": 1628.53955078125,
"completions/min_length": 309.0,
"completions/min_terminated_length": 309.0,
"entropy": 0.24800319969654083,
"epoch": 0.3078947368421053,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.021536562591791153,
"learning_rate": 1e-06,
"loss": 0.0693,
"num_tokens": 47720738.0,
"reward": 0.7720678448677063,
"reward_std": 0.22057949006557465,
"rewards/progression_diversity/mean": -0.0002511652419343591,
"rewards/progression_diversity/std": 0.004545476287603378,
"rewards/symbolic_reward_accuracy/mean": 0.8359375,
"rewards/symbolic_reward_accuracy/std": 0.37069445848464966,
"rewards/symbolic_reward_partial_score/mean": 0.9055989980697632,
"rewards/symbolic_reward_partial_score/std": 0.2584453523159027,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0540456771850586,
"sampling/importance_sampling_ratio/min": 0.0003275219933129847,
"sampling/sampling_logp_difference/max": 8.023955345153809,
"sampling/sampling_logp_difference/mean": 0.10680307447910309,
"step": 117
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2550651431083679,
"epoch": 0.3105263157894737,
"grad_norm": 0.014003097079694271,
"learning_rate": 1e-06,
"loss": 0.0051,
"step": 118
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.25681254267692566,
"epoch": 0.3131578947368421,
"grad_norm": 0.030175212770700455,
"learning_rate": 1e-06,
"loss": 0.0557,
"step": 119
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.25778651237487793,
"epoch": 0.3157894736842105,
"grad_norm": 0.020599860697984695,
"learning_rate": 1e-06,
"loss": 0.01,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11032.0,
"completions/mean_length": 2128.177734375,
"completions/mean_terminated_length": 1786.0380859375,
"completions/min_length": 348.0,
"completions/min_terminated_length": 348.0,
"entropy": 0.26357313990592957,
"epoch": 0.31842105263157894,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.032891228795051575,
"learning_rate": 1e-06,
"loss": 0.0055,
"num_tokens": 49218685.0,
"reward": 0.7386098504066467,
"reward_std": 0.24482640624046326,
"rewards/progression_diversity/mean": -0.0013221381232142448,
"rewards/progression_diversity/std": 0.017824513837695122,
"rewards/symbolic_reward_accuracy/mean": 0.798828125,
"rewards/symbolic_reward_accuracy/std": 0.4012683033943176,
"rewards/symbolic_reward_partial_score/mean": 0.87158203125,
"rewards/symbolic_reward_partial_score/std": 0.29892396926879883,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.052980661392212,
"sampling/importance_sampling_ratio/min": 0.006169522181153297,
"sampling/sampling_logp_difference/max": 5.088133811950684,
"sampling/sampling_logp_difference/mean": 0.10582087934017181,
"step": 121
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.24658922851085663,
"epoch": 0.32105263157894737,
"grad_norm": 0.024154705926775932,
"learning_rate": 1e-06,
"loss": 0.0446,
"step": 122
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.25428229570388794,
"epoch": 0.3236842105263158,
"grad_norm": 0.03276718035340309,
"learning_rate": 1e-06,
"loss": 0.0973,
"step": 123
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.2513224929571152,
"epoch": 0.3263157894736842,
"grad_norm": 0.027816835790872574,
"learning_rate": 1e-06,
"loss": 0.0737,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10545.0,
"completions/mean_length": 2134.193359375,
"completions/mean_terminated_length": 1704.11865234375,
"completions/min_length": 341.0,
"completions/min_terminated_length": 341.0,
"entropy": 0.24834615737199783,
"epoch": 0.32894736842105265,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.02625676989555359,
"learning_rate": 1e-06,
"loss": 0.0235,
"num_tokens": 50700704.0,
"reward": 0.7628857493400574,
"reward_std": 0.21416853368282318,
"rewards/progression_diversity/mean": -0.000490223930682987,
"rewards/progression_diversity/std": 0.006874173413962126,
"rewards/symbolic_reward_accuracy/mean": 0.828125,
"rewards/symbolic_reward_accuracy/std": 0.3776407241821289,
"rewards/symbolic_reward_partial_score/mean": 0.8951822519302368,
"rewards/symbolic_reward_partial_score/std": 0.26842740178108215,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0525569915771484,
"sampling/importance_sampling_ratio/min": 0.0012693606549873948,
"sampling/sampling_logp_difference/max": 6.669241905212402,
"sampling/sampling_logp_difference/mean": 0.10405848920345306,
"step": 125
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.2593626528978348,
"epoch": 0.33157894736842103,
"grad_norm": 0.028445186093449593,
"learning_rate": 1e-06,
"loss": 0.0248,
"step": 126
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.25416844338178635,
"epoch": 0.33421052631578946,
"grad_norm": 0.02981475181877613,
"learning_rate": 1e-06,
"loss": 0.0686,
"step": 127
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2521153539419174,
"epoch": 0.3368421052631579,
"grad_norm": 0.03827949985861778,
"learning_rate": 1e-06,
"loss": 0.089,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11825.0,
"completions/mean_length": 2242.904296875,
"completions/mean_terminated_length": 1668.06298828125,
"completions/min_length": 319.0,
"completions/min_terminated_length": 319.0,
"entropy": 0.2524839788675308,
"epoch": 0.3394736842105263,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0338742621243,
"learning_rate": 1e-06,
"loss": 0.0749,
"num_tokens": 52248207.0,
"reward": 0.7660582065582275,
"reward_std": 0.2173374444246292,
"rewards/progression_diversity/mean": -0.0006320271058939397,
"rewards/progression_diversity/std": 0.007326250895857811,
"rewards/symbolic_reward_accuracy/mean": 0.833984375,
"rewards/symbolic_reward_accuracy/std": 0.3724585771560669,
"rewards/symbolic_reward_partial_score/mean": 0.8966470956802368,
"rewards/symbolic_reward_partial_score/std": 0.2703816890716553,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0492503643035889,
"sampling/importance_sampling_ratio/min": 0.0005629548104479909,
"sampling/sampling_logp_difference/max": 7.482311248779297,
"sampling/sampling_logp_difference/mean": 0.09761972725391388,
"step": 129
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.24737834185361862,
"epoch": 0.34210526315789475,
"grad_norm": 0.027043595910072327,
"learning_rate": 1e-06,
"loss": 0.0991,
"step": 130
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.23644034564495087,
"epoch": 0.3447368421052632,
"grad_norm": 0.02878217026591301,
"learning_rate": 1e-06,
"loss": 0.0564,
"step": 131
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.24825213849544525,
"epoch": 0.3473684210526316,
"grad_norm": 0.03835158050060272,
"learning_rate": 1e-06,
"loss": 0.0377,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15272.0,
"completions/mean_length": 1937.564453125,
"completions/mean_terminated_length": 1620.377197265625,
"completions/min_length": 346.0,
"completions/min_terminated_length": 346.0,
"entropy": 0.24670010060071945,
"epoch": 0.35,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.022889573127031326,
"learning_rate": 1e-06,
"loss": 0.0667,
"num_tokens": 53642704.0,
"reward": 0.8256836533546448,
"reward_std": 0.16099657118320465,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9384765625,
"rewards/symbolic_reward_partial_score/std": 0.2168099284172058,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0539462566375732,
"sampling/importance_sampling_ratio/min": 8.059991523623466e-05,
"sampling/sampling_logp_difference/max": 9.426012992858887,
"sampling/sampling_logp_difference/mean": 0.1068400889635086,
"step": 133
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.25666534900665283,
"epoch": 0.3526315789473684,
"grad_norm": 0.019193602725863457,
"learning_rate": 1e-06,
"loss": 0.0584,
"step": 134
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.25828083604574203,
"epoch": 0.35526315789473684,
"grad_norm": 0.025437770411372185,
"learning_rate": 1e-06,
"loss": 0.0151,
"step": 135
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.2562948167324066,
"epoch": 0.35789473684210527,
"grad_norm": 0.026910221204161644,
"learning_rate": 1e-06,
"loss": 0.047,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13533.0,
"completions/mean_length": 2228.55859375,
"completions/mean_terminated_length": 1771.931396484375,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"entropy": 0.2502548471093178,
"epoch": 0.3605263157894737,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0259707048535347,
"learning_rate": 1e-06,
"loss": 0.0639,
"num_tokens": 55173518.0,
"reward": 0.7800472974777222,
"reward_std": 0.17662927508354187,
"rewards/progression_diversity/mean": -0.00308664096519351,
"rewards/progression_diversity/std": 0.02825590781867504,
"rewards/symbolic_reward_accuracy/mean": 0.849609375,
"rewards/symbolic_reward_accuracy/std": 0.35780346393585205,
"rewards/symbolic_reward_partial_score/mean": 0.9095051884651184,
"rewards/symbolic_reward_partial_score/std": 0.25963231921195984,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0506607294082642,
"sampling/importance_sampling_ratio/min": 0.00025443226331844926,
"sampling/sampling_logp_difference/max": 8.27647590637207,
"sampling/sampling_logp_difference/mean": 0.10103225708007812,
"step": 137
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.24615327268838882,
"epoch": 0.3631578947368421,
"grad_norm": 0.03407861292362213,
"learning_rate": 1e-06,
"loss": 0.0456,
"step": 138
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.24503520876169205,
"epoch": 0.36578947368421055,
"grad_norm": 0.020230667665600777,
"learning_rate": 1e-06,
"loss": 0.052,
"step": 139
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.2436116561293602,
"epoch": 0.3684210526315789,
"grad_norm": 0.033303920179605484,
"learning_rate": 1e-06,
"loss": 0.0703,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13365.0,
"completions/mean_length": 1732.8828125,
"completions/mean_terminated_length": 1500.325439453125,
"completions/min_length": 370.0,
"completions/min_terminated_length": 370.0,
"entropy": 0.25387296825647354,
"epoch": 0.37105263157894736,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.02063675969839096,
"learning_rate": 1e-06,
"loss": -0.0168,
"num_tokens": 56458706.0,
"reward": 0.8216304779052734,
"reward_std": 0.14440101385116577,
"rewards/progression_diversity/mean": -4.0891380194807425e-05,
"rewards/progression_diversity/std": 0.0009252663003280759,
"rewards/symbolic_reward_accuracy/mean": 0.900390625,
"rewards/symbolic_reward_accuracy/std": 0.29977133870124817,
"rewards/symbolic_reward_partial_score/mean": 0.9431965947151184,
"rewards/symbolic_reward_partial_score/std": 0.1989695280790329,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0523567199707031,
"sampling/importance_sampling_ratio/min": 1.1014159326805384e-06,
"sampling/sampling_logp_difference/max": 13.718914031982422,
"sampling/sampling_logp_difference/mean": 0.10537827014923096,
"step": 141
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.24923139810562134,
"epoch": 0.3736842105263158,
"grad_norm": 0.02487753890454769,
"learning_rate": 1e-06,
"loss": 0.0127,
"step": 142
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.248472198843956,
"epoch": 0.3763157894736842,
"grad_norm": 0.017968177795410156,
"learning_rate": 1e-06,
"loss": 0.0407,
"step": 143
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.24632388353347778,
"epoch": 0.37894736842105264,
"grad_norm": 0.025321682915091515,
"learning_rate": 1e-06,
"loss": 0.0718,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13227.0,
"completions/mean_length": 2052.263671875,
"completions/mean_terminated_length": 1708.3021240234375,
"completions/min_length": 320.0,
"completions/min_terminated_length": 320.0,
"entropy": 0.2430901676416397,
"epoch": 0.3815789473684211,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.042146675288677216,
"learning_rate": 1e-06,
"loss": 0.069,
"num_tokens": 57902585.0,
"reward": 0.7910541296005249,
"reward_std": 0.1890793740749359,
"rewards/progression_diversity/mean": -0.0010371711105108261,
"rewards/progression_diversity/std": 0.013147015124559402,
"rewards/symbolic_reward_accuracy/mean": 0.86328125,
"rewards/symbolic_reward_accuracy/std": 0.3438861668109894,
"rewards/symbolic_reward_partial_score/mean": 0.9181314706802368,
"rewards/symbolic_reward_partial_score/std": 0.24215349555015564,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0506622791290283,
"sampling/importance_sampling_ratio/min": 3.2121545423535736e-09,
"sampling/sampling_logp_difference/max": 19.556324005126953,
"sampling/sampling_logp_difference/mean": 0.10158580541610718,
"step": 145
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.24240544438362122,
"epoch": 0.38421052631578945,
"grad_norm": 0.023529332131147385,
"learning_rate": 1e-06,
"loss": 0.0249,
"step": 146
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.2464822679758072,
"epoch": 0.3868421052631579,
"grad_norm": 0.020634565502405167,
"learning_rate": 1e-06,
"loss": 0.0987,
"step": 147
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.2451770007610321,
"epoch": 0.3894736842105263,
"grad_norm": 0.030732842162251472,
"learning_rate": 1e-06,
"loss": 0.0308,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9631.0,
"completions/mean_length": 2138.572265625,
"completions/mean_terminated_length": 1679.042236328125,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"entropy": 0.2397768646478653,
"epoch": 0.39210526315789473,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.02241184562444687,
"learning_rate": 1e-06,
"loss": 0.0195,
"num_tokens": 59407358.0,
"reward": 0.7700119018554688,
"reward_std": 0.21146252751350403,
"rewards/progression_diversity/mean": -0.0007633853820152581,
"rewards/progression_diversity/std": 0.009358874522149563,
"rewards/symbolic_reward_accuracy/mean": 0.837890625,
"rewards/symbolic_reward_accuracy/std": 0.3689115643501282,
"rewards/symbolic_reward_partial_score/mean": 0.9007161855697632,
"rewards/symbolic_reward_partial_score/std": 0.2675977051258087,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0503795146942139,
"sampling/importance_sampling_ratio/min": 1.3879385960535728e-06,
"sampling/sampling_logp_difference/max": 13.487690925598145,
"sampling/sampling_logp_difference/mean": 0.10065832734107971,
"step": 149
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.24372172355651855,
"epoch": 0.39473684210526316,
"grad_norm": 0.034816596657037735,
"learning_rate": 1e-06,
"loss": 0.0347,
"step": 150
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.2407890260219574,
"epoch": 0.3973684210526316,
"grad_norm": 0.021488770842552185,
"learning_rate": 1e-06,
"loss": 0.0215,
"step": 151
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2366773709654808,
"epoch": 0.4,
"grad_norm": 0.02249423786997795,
"learning_rate": 1e-06,
"loss": 0.0609,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13010.0,
"completions/mean_length": 2161.275390625,
"completions/mean_terminated_length": 1672.8182373046875,
"completions/min_length": 338.0,
"completions/min_terminated_length": 338.0,
"entropy": 0.23152073472738266,
"epoch": 0.4026315789473684,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.03908664733171463,
"learning_rate": 1e-06,
"loss": 0.0989,
"num_tokens": 60915915.0,
"reward": 0.7834299206733704,
"reward_std": 0.21882712841033936,
"rewards/progression_diversity/mean": -0.0017377887852489948,
"rewards/progression_diversity/std": 0.021848157048225403,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.90185546875,
"rewards/symbolic_reward_partial_score/std": 0.2732160687446594,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0461015701293945,
"sampling/importance_sampling_ratio/min": 5.239376719146094e-07,
"sampling/sampling_logp_difference/max": 14.461893081665039,
"sampling/sampling_logp_difference/mean": 0.09288465231657028,
"step": 153
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.2292015701532364,
"epoch": 0.4052631578947368,
"grad_norm": 0.030572297051548958,
"learning_rate": 1e-06,
"loss": 0.1011,
"step": 154
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.23775313049554825,
"epoch": 0.40789473684210525,
"grad_norm": 0.028754258528351784,
"learning_rate": 1e-06,
"loss": 0.0394,
"step": 155
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.24100537598133087,
"epoch": 0.4105263157894737,
"grad_norm": 0.021660154685378075,
"learning_rate": 1e-06,
"loss": 0.0211,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13572.0,
"completions/mean_length": 1708.77734375,
"completions/mean_terminated_length": 1534.762939453125,
"completions/min_length": 309.0,
"completions/min_terminated_length": 309.0,
"entropy": 0.24858221411705017,
"epoch": 0.4131578947368421,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.022633863613009453,
"learning_rate": 1e-06,
"loss": 0.0101,
"num_tokens": 62199769.0,
"reward": 0.8241158723831177,
"reward_std": 0.1481999158859253,
"rewards/progression_diversity/mean": -0.0005272195558063686,
"rewards/progression_diversity/std": 0.010804719291627407,
"rewards/symbolic_reward_accuracy/mean": 0.90625,
"rewards/symbolic_reward_accuracy/std": 0.29176566004753113,
"rewards/symbolic_reward_partial_score/mean": 0.9384765625,
"rewards/symbolic_reward_partial_score/std": 0.21358926594257355,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0517460107803345,
"sampling/importance_sampling_ratio/min": 0.0012036137050017715,
"sampling/sampling_logp_difference/max": 6.722426891326904,
"sampling/sampling_logp_difference/mean": 0.10469282418489456,
"step": 157
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2418229877948761,
"epoch": 0.41578947368421054,
"grad_norm": 0.024082506075501442,
"learning_rate": 1e-06,
"loss": 0.0616,
"step": 158
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.24460408091545105,
"epoch": 0.41842105263157897,
"grad_norm": 0.0060430532321333885,
"learning_rate": 1e-06,
"loss": 0.0383,
"step": 159
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.23893345147371292,
"epoch": 0.42105263157894735,
"grad_norm": 0.014973205514252186,
"learning_rate": 1e-06,
"loss": 0.0374,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12721.0,
"completions/mean_length": 2082.78125,
"completions/mean_terminated_length": 1768.782470703125,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"entropy": 0.23413384705781937,
"epoch": 0.4236842105263158,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.02440241537988186,
"learning_rate": 1e-06,
"loss": 0.0183,
"num_tokens": 63668137.0,
"reward": 0.8117564916610718,
"reward_std": 0.18439695239067078,
"rewards/progression_diversity/mean": -0.0011134764645248652,
"rewards/progression_diversity/std": 0.014692641794681549,
"rewards/symbolic_reward_accuracy/mean": 0.892578125,
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534,
"rewards/symbolic_reward_partial_score/mean": 0.9278970956802368,
"rewards/symbolic_reward_partial_score/std": 0.23557829856872559,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.048673152923584,
"sampling/importance_sampling_ratio/min": 0.0006473406101576984,
"sampling/sampling_logp_difference/max": 7.34263801574707,
"sampling/sampling_logp_difference/mean": 0.09804990142583847,
"step": 161
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2300802245736122,
"epoch": 0.4263157894736842,
"grad_norm": 0.02143077924847603,
"learning_rate": 1e-06,
"loss": 0.0697,
"step": 162
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.24065203219652176,
"epoch": 0.42894736842105263,
"grad_norm": 0.02908634953200817,
"learning_rate": 1e-06,
"loss": 0.0755,
"step": 163
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.24057897180318832,
"epoch": 0.43157894736842106,
"grad_norm": 0.04149603471159935,
"learning_rate": 1e-06,
"loss": 0.0485,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11887.0,
"completions/mean_length": 2101.234375,
"completions/mean_terminated_length": 1845.6778564453125,
"completions/min_length": 253.0,
"completions/min_terminated_length": 253.0,
"entropy": 0.23806512355804443,
"epoch": 0.4342105263157895,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.03864562138915062,
"learning_rate": 1e-06,
"loss": 0.0451,
"num_tokens": 65144289.0,
"reward": 0.7944764494895935,
"reward_std": 0.19753678143024445,
"rewards/progression_diversity/mean": -0.000595143239479512,
"rewards/progression_diversity/std": 0.010785636492073536,
"rewards/symbolic_reward_accuracy/mean": 0.8671875,
"rewards/symbolic_reward_accuracy/std": 0.33970388770103455,
"rewards/symbolic_reward_partial_score/mean": 0.91845703125,
"rewards/symbolic_reward_partial_score/std": 0.24648991227149963,
"rewards/tag_count_reward/mean": -0.013671875,
"rewards/tag_count_reward/std": 0.1162383034825325,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.049810767173767,
"sampling/importance_sampling_ratio/min": 1.5343101040343754e-05,
"sampling/sampling_logp_difference/max": 11.084844589233398,
"sampling/sampling_logp_difference/mean": 0.10091371834278107,
"step": 165
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.23936307430267334,
"epoch": 0.4368421052631579,
"grad_norm": 0.02704034186899662,
"learning_rate": 1e-06,
"loss": 0.0388,
"step": 166
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.2366686388850212,
"epoch": 0.4394736842105263,
"grad_norm": 0.016262901946902275,
"learning_rate": 1e-06,
"loss": 0.0106,
"step": 167
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.22937704622745514,
"epoch": 0.4421052631578947,
"grad_norm": 0.013757162727415562,
"learning_rate": 1e-06,
"loss": 0.0324,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11079.0,
"completions/mean_length": 1793.64453125,
"completions/mean_terminated_length": 1443.47607421875,
"completions/min_length": 332.0,
"completions/min_terminated_length": 332.0,
"entropy": 0.23620514571666718,
"epoch": 0.44473684210526315,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.018099864944815636,
"learning_rate": 1e-06,
"loss": 0.0122,
"num_tokens": 66446891.0,
"reward": 0.8357792496681213,
"reward_std": 0.14379340410232544,
"rewards/progression_diversity/mean": -0.0011789561249315739,
"rewards/progression_diversity/std": 0.015437182039022446,
"rewards/symbolic_reward_accuracy/mean": 0.923828125,
"rewards/symbolic_reward_accuracy/std": 0.26553234457969666,
"rewards/symbolic_reward_partial_score/mean": 0.94482421875,
"rewards/symbolic_reward_partial_score/std": 0.2143392264842987,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.049529790878296,
"sampling/importance_sampling_ratio/min": 0.0015639930497854948,
"sampling/sampling_logp_difference/max": 6.460513114929199,
"sampling/sampling_logp_difference/mean": 0.10102123022079468,
"step": 169
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.24607934057712555,
"epoch": 0.4473684210526316,
"grad_norm": 0.014026135206222534,
"learning_rate": 1e-06,
"loss": 0.0313,
"step": 170
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.23312748968601227,
"epoch": 0.45,
"grad_norm": 0.03011462651193142,
"learning_rate": 1e-06,
"loss": 0.0888,
"step": 171
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.23896265774965286,
"epoch": 0.45263157894736844,
"grad_norm": 0.009486875496804714,
"learning_rate": 1e-06,
"loss": 0.0374,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10726.0,
"completions/mean_length": 1911.1640625,
"completions/mean_terminated_length": 1652.2066650390625,
"completions/min_length": 335.0,
"completions/min_terminated_length": 335.0,
"entropy": 0.23735451698303223,
"epoch": 0.45526315789473687,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0280209518969059,
"learning_rate": 1e-06,
"loss": 0.0433,
"num_tokens": 67804351.0,
"reward": 0.8081526756286621,
"reward_std": 0.16803589463233948,
"rewards/progression_diversity/mean": -0.00016519335622433573,
"rewards/progression_diversity/std": 0.003562908386811614,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9249674081802368,
"rewards/symbolic_reward_partial_score/std": 0.2344280630350113,
"rewards/tag_count_reward/mean": -0.013671875,
"rewards/tag_count_reward/std": 0.1162383034825325,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0501618385314941,
"sampling/importance_sampling_ratio/min": 0.0007048218976706266,
"sampling/sampling_logp_difference/max": 7.257565498352051,
"sampling/sampling_logp_difference/mean": 0.10282571613788605,
"step": 173
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.2369169294834137,
"epoch": 0.45789473684210524,
"grad_norm": 0.026422906666994095,
"learning_rate": 1e-06,
"loss": 0.0448,
"step": 174
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.2406521737575531,
"epoch": 0.4605263157894737,
"grad_norm": 0.028511840850114822,
"learning_rate": 1e-06,
"loss": 0.0538,
"step": 175
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.2376062050461769,
"epoch": 0.4631578947368421,
"grad_norm": 0.029201552271842957,
"learning_rate": 1e-06,
"loss": 0.0522,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13314.0,
"completions/mean_length": 1935.361328125,
"completions/mean_terminated_length": 1647.5399169921875,
"completions/min_length": 328.0,
"completions/min_terminated_length": 328.0,
"entropy": 0.23343181610107422,
"epoch": 0.46578947368421053,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.036340776830911636,
"learning_rate": 1e-06,
"loss": 0.0636,
"num_tokens": 69200088.0,
"reward": 0.8155167102813721,
"reward_std": 0.1823035329580307,
"rewards/progression_diversity/mean": -0.0010712584480643272,
"rewards/progression_diversity/std": 0.016564983874559402,
"rewards/symbolic_reward_accuracy/mean": 0.89453125,
"rewards/symbolic_reward_accuracy/std": 0.3074568510055542,
"rewards/symbolic_reward_partial_score/mean": 0.9358723759651184,
"rewards/symbolic_reward_partial_score/std": 0.22121170163154602,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0475256443023682,
"sampling/importance_sampling_ratio/min": 1.0533493565237362e-12,
"sampling/sampling_logp_difference/max": 27.57904624938965,
"sampling/sampling_logp_difference/mean": 0.09736086428165436,
"step": 177
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.22760003805160522,
"epoch": 0.46842105263157896,
"grad_norm": 0.034887488931417465,
"learning_rate": 1e-06,
"loss": 0.0751,
"step": 178
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.22835474461317062,
"epoch": 0.4710526315789474,
"grad_norm": 0.02664666809141636,
"learning_rate": 1e-06,
"loss": 0.0702,
"step": 179
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.23141715675592422,
"epoch": 0.47368421052631576,
"grad_norm": 0.02732260897755623,
"learning_rate": 1e-06,
"loss": 0.0256,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13435.0,
"completions/mean_length": 2063.080078125,
"completions/mean_terminated_length": 1748.648681640625,
"completions/min_length": 369.0,
"completions/min_terminated_length": 369.0,
"entropy": 0.22150472551584244,
"epoch": 0.4763157894736842,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.048945918679237366,
"learning_rate": 1e-06,
"loss": 0.0967,
"num_tokens": 70669697.0,
"reward": 0.7893031239509583,
"reward_std": 0.20550626516342163,
"rewards/progression_diversity/mean": -0.00035783840576186776,
"rewards/progression_diversity/std": 0.008096958510577679,
"rewards/symbolic_reward_accuracy/mean": 0.8671875,
"rewards/symbolic_reward_accuracy/std": 0.33970388770103455,
"rewards/symbolic_reward_partial_score/mean": 0.90380859375,
"rewards/symbolic_reward_partial_score/std": 0.27613478899002075,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0472935438156128,
"sampling/importance_sampling_ratio/min": 0.004802103620022535,
"sampling/sampling_logp_difference/max": 5.338701248168945,
"sampling/sampling_logp_difference/mean": 0.09636872261762619,
"step": 181
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.22790049761533737,
"epoch": 0.4789473684210526,
"grad_norm": 0.021670255810022354,
"learning_rate": 1e-06,
"loss": 0.0223,
"step": 182
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.22305145859718323,
"epoch": 0.48157894736842105,
"grad_norm": 0.012241641990840435,
"learning_rate": 1e-06,
"loss": 0.0197,
"step": 183
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.21923468261957169,
"epoch": 0.4842105263157895,
"grad_norm": 0.029154837131500244,
"learning_rate": 1e-06,
"loss": 0.074,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10743.0,
"completions/mean_length": 1856.03515625,
"completions/mean_terminated_length": 1507.364013671875,
"completions/min_length": 269.0,
"completions/min_terminated_length": 269.0,
"entropy": 0.2165418118238449,
"epoch": 0.4868421052631579,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.04361012578010559,
"learning_rate": 1e-06,
"loss": 0.1,
"num_tokens": 72034643.0,
"reward": 0.8248018026351929,
"reward_std": 0.15697109699249268,
"rewards/progression_diversity/mean": -0.0002908821334131062,
"rewards/progression_diversity/std": 0.00464981934055686,
"rewards/symbolic_reward_accuracy/mean": 0.90234375,
"rewards/symbolic_reward_accuracy/std": 0.29713961482048035,
"rewards/symbolic_reward_partial_score/mean": 0.9518228769302368,
"rewards/symbolic_reward_partial_score/std": 0.18324565887451172,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0449192523956299,
"sampling/importance_sampling_ratio/min": 0.000814249215181917,
"sampling/sampling_logp_difference/max": 7.11324405670166,
"sampling/sampling_logp_difference/mean": 0.09324932098388672,
"step": 185
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.22823286801576614,
"epoch": 0.48947368421052634,
"grad_norm": 0.015330553986132145,
"learning_rate": 1e-06,
"loss": 0.0087,
"step": 186
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.22463513165712357,
"epoch": 0.4921052631578947,
"grad_norm": 0.028646033257246017,
"learning_rate": 1e-06,
"loss": 0.0249,
"step": 187
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.21679264307022095,
"epoch": 0.49473684210526314,
"grad_norm": 0.027475610375404358,
"learning_rate": 1e-06,
"loss": 0.1208,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14325.0,
"completions/mean_length": 1838.8671875,
"completions/mean_terminated_length": 1489.7840576171875,
"completions/min_length": 378.0,
"completions/min_terminated_length": 378.0,
"entropy": 0.2217376008629799,
"epoch": 0.49736842105263157,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.03827635198831558,
"learning_rate": 1e-06,
"loss": 0.0414,
"num_tokens": 73378767.0,
"reward": 0.8262168169021606,
"reward_std": 0.1503533273935318,
"rewards/progression_diversity/mean": -0.00038728650542907417,
"rewards/progression_diversity/std": 0.0051419991068542,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.94091796875,
"rewards/symbolic_reward_partial_score/std": 0.215196430683136,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0445411205291748,
"sampling/importance_sampling_ratio/min": 8.583670023654122e-07,
"sampling/sampling_logp_difference/max": 13.968234062194824,
"sampling/sampling_logp_difference/mean": 0.09397557377815247,
"step": 189
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.21815990656614304,
"epoch": 0.5,
"grad_norm": 0.027534402906894684,
"learning_rate": 1e-06,
"loss": 0.0625,
"step": 190
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.21029751747846603,
"epoch": 0.5026315789473684,
"grad_norm": 0.028749065473675728,
"learning_rate": 1e-06,
"loss": 0.0977,
"step": 191
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.21950938552618027,
"epoch": 0.5052631578947369,
"grad_norm": 0.02661977894604206,
"learning_rate": 1e-06,
"loss": 0.0369,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12097.0,
"completions/mean_length": 2130.484375,
"completions/mean_terminated_length": 1817.532958984375,
"completions/min_length": 383.0,
"completions/min_terminated_length": 383.0,
"entropy": 0.2030806839466095,
"epoch": 0.5078947368421053,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.025007417425513268,
"learning_rate": 1e-06,
"loss": 0.0233,
"num_tokens": 74868871.0,
"reward": 0.8149902820587158,
"reward_std": 0.18850557506084442,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.900390625,
"rewards/symbolic_reward_accuracy/std": 0.29977133870124817,
"rewards/symbolic_reward_partial_score/mean": 0.92236328125,
"rewards/symbolic_reward_partial_score/std": 0.2531226873397827,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0420916080474854,
"sampling/importance_sampling_ratio/min": 5.281509857013589e-06,
"sampling/sampling_logp_difference/max": 12.151298522949219,
"sampling/sampling_logp_difference/mean": 0.08910438418388367,
"step": 193
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.20039298385381699,
"epoch": 0.5105263157894737,
"grad_norm": 0.019791144877672195,
"learning_rate": 1e-06,
"loss": 0.0733,
"step": 194
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.20205791294574738,
"epoch": 0.5131578947368421,
"grad_norm": 0.03766762465238571,
"learning_rate": 1e-06,
"loss": 0.054,
"step": 195
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.19714680314064026,
"epoch": 0.5157894736842106,
"grad_norm": 0.04690009355545044,
"learning_rate": 1e-06,
"loss": 0.1071,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11150.0,
"completions/mean_length": 1767.6796875,
"completions/mean_terminated_length": 1386.893798828125,
"completions/min_length": 319.0,
"completions/min_terminated_length": 319.0,
"entropy": 0.19719091057777405,
"epoch": 0.5184210526315789,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.031063755974173546,
"learning_rate": 1e-06,
"loss": 0.0426,
"num_tokens": 76177731.0,
"reward": 0.823818027973175,
"reward_std": 0.16009867191314697,
"rewards/progression_diversity/mean": -0.0010085422545671463,
"rewards/progression_diversity/std": 0.010762260295450687,
"rewards/symbolic_reward_accuracy/mean": 0.904296875,
"rewards/symbolic_reward_accuracy/std": 0.2944713830947876,
"rewards/symbolic_reward_partial_score/mean": 0.9459635019302368,
"rewards/symbolic_reward_partial_score/std": 0.19425591826438904,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0391511917114258,
"sampling/importance_sampling_ratio/min": 0.0001364499912597239,
"sampling/sampling_logp_difference/max": 8.899552345275879,
"sampling/sampling_logp_difference/mean": 0.08513116836547852,
"step": 197
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.19633030891418457,
"epoch": 0.5210526315789473,
"grad_norm": 0.018081026151776314,
"learning_rate": 1e-06,
"loss": 0.0274,
"step": 198
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.19438430666923523,
"epoch": 0.5236842105263158,
"grad_norm": 0.03003780171275139,
"learning_rate": 1e-06,
"loss": 0.0837,
"step": 199
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.1886511892080307,
"epoch": 0.5263157894736842,
"grad_norm": 0.032905541360378265,
"learning_rate": 1e-06,
"loss": 0.0643,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16023.0,
"completions/mean_length": 2301.400390625,
"completions/mean_terminated_length": 1728.9368896484375,
"completions/min_length": 343.0,
"completions/min_terminated_length": 343.0,
"entropy": 0.18248793482780457,
"epoch": 0.5289473684210526,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.04558183625340462,
"learning_rate": 1e-06,
"loss": 0.1227,
"num_tokens": 77773552.0,
"reward": 0.8007115721702576,
"reward_std": 0.21243008971214294,
"rewards/progression_diversity/mean": -0.0020881860982626677,
"rewards/progression_diversity/std": 0.021307511255145073,
"rewards/symbolic_reward_accuracy/mean": 0.880859375,
"rewards/symbolic_reward_accuracy/std": 0.32427072525024414,
"rewards/symbolic_reward_partial_score/mean": 0.9197590947151184,
"rewards/symbolic_reward_partial_score/std": 0.2490549385547638,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0355485677719116,
"sampling/importance_sampling_ratio/min": 1.0011056567060805e-10,
"sampling/sampling_logp_difference/max": 23.02474594116211,
"sampling/sampling_logp_difference/mean": 0.07838210463523865,
"step": 201
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.17451690137386322,
"epoch": 0.531578947368421,
"grad_norm": 0.04084772244095802,
"learning_rate": 1e-06,
"loss": 0.1141,
"step": 202
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.17165183275938034,
"epoch": 0.5342105263157895,
"grad_norm": 0.031080789864063263,
"learning_rate": 1e-06,
"loss": 0.0879,
"step": 203
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.16927115619182587,
"epoch": 0.5368421052631579,
"grad_norm": 0.016663571819663048,
"learning_rate": 1e-06,
"loss": 0.0779,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16068.0,
"completions/mean_length": 2071.619140625,
"completions/mean_terminated_length": 1550.115478515625,
"completions/min_length": 358.0,
"completions/min_terminated_length": 358.0,
"entropy": 0.16112020611763,
"epoch": 0.5394736842105263,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.018799329176545143,
"learning_rate": 1e-06,
"loss": 0.0181,
"num_tokens": 79247213.0,
"reward": 0.8091070055961609,
"reward_std": 0.1729590892791748,
"rewards/progression_diversity/mean": -0.002385494764894247,
"rewards/progression_diversity/std": 0.028513550758361816,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9347330331802368,
"rewards/symbolic_reward_partial_score/std": 0.21787308156490326,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0297162532806396,
"sampling/importance_sampling_ratio/min": 1.7676053403192782e-06,
"sampling/sampling_logp_difference/max": 13.245884895324707,
"sampling/sampling_logp_difference/mean": 0.0692635178565979,
"step": 205
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.15220321714878082,
"epoch": 0.5421052631578948,
"grad_norm": 0.037522438913583755,
"learning_rate": 1e-06,
"loss": 0.1625,
"step": 206
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.15934187173843384,
"epoch": 0.5447368421052632,
"grad_norm": 0.013017321936786175,
"learning_rate": 1e-06,
"loss": 0.0563,
"step": 207
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.15468831360340118,
"epoch": 0.5473684210526316,
"grad_norm": 0.009292000904679298,
"learning_rate": 1e-06,
"loss": 0.0932,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9757.0,
"completions/mean_length": 1568.158203125,
"completions/mean_terminated_length": 1212.5780029296875,
"completions/min_length": 291.0,
"completions/min_terminated_length": 291.0,
"entropy": 0.15928836166858673,
"epoch": 0.55,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.027132874354720116,
"learning_rate": 1e-06,
"loss": 0.0402,
"num_tokens": 80432382.0,
"reward": 0.8553647994995117,
"reward_std": 0.11348331719636917,
"rewards/progression_diversity/mean": -0.0006339521496556699,
"rewards/progression_diversity/std": 0.00838653463870287,
"rewards/symbolic_reward_accuracy/mean": 0.9453125,
"rewards/symbolic_reward_accuracy/std": 0.2275916188955307,
"rewards/symbolic_reward_partial_score/mean": 0.9690755009651184,
"rewards/symbolic_reward_partial_score/std": 0.1494486927986145,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0306223630905151,
"sampling/importance_sampling_ratio/min": 7.626142905792221e-05,
"sampling/sampling_logp_difference/max": 9.481343269348145,
"sampling/sampling_logp_difference/mean": 0.07124973833560944,
"step": 209
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.15551680326461792,
"epoch": 0.5526315789473685,
"grad_norm": 0.018515659496188164,
"learning_rate": 1e-06,
"loss": 0.0522,
"step": 210
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.15634237229824066,
"epoch": 0.5552631578947368,
"grad_norm": 0.012999890372157097,
"learning_rate": 1e-06,
"loss": 0.0665,
"step": 211
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.15420294553041458,
"epoch": 0.5578947368421052,
"grad_norm": 0.021096454933285713,
"learning_rate": 1e-06,
"loss": 0.131,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15015.0,
"completions/mean_length": 2094.546875,
"completions/mean_terminated_length": 1422.44580078125,
"completions/min_length": 311.0,
"completions/min_terminated_length": 311.0,
"entropy": 0.1454745978116989,
"epoch": 0.5605263157894737,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.021531764417886734,
"learning_rate": 1e-06,
"loss": 0.0801,
"num_tokens": 81900598.0,
"reward": 0.7925652265548706,
"reward_std": 0.19417259097099304,
"rewards/progression_diversity/mean": -0.0012926449999213219,
"rewards/progression_diversity/std": 0.012499667704105377,
"rewards/symbolic_reward_accuracy/mean": 0.873046875,
"rewards/symbolic_reward_accuracy/std": 0.33324605226516724,
"rewards/symbolic_reward_partial_score/mean": 0.91015625,
"rewards/symbolic_reward_partial_score/std": 0.2634425461292267,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0279793739318848,
"sampling/importance_sampling_ratio/min": 3.422269249670623e-13,
"sampling/sampling_logp_difference/max": 28.70330238342285,
"sampling/sampling_logp_difference/mean": 0.06678377091884613,
"step": 213
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.14702380448579788,
"epoch": 0.5631578947368421,
"grad_norm": 0.020642530173063278,
"learning_rate": 1e-06,
"loss": 0.0657,
"step": 214
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.14872030168771744,
"epoch": 0.5657894736842105,
"grad_norm": 0.016585860401391983,
"learning_rate": 1e-06,
"loss": -0.0113,
"step": 215
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.14099013805389404,
"epoch": 0.5684210526315789,
"grad_norm": 0.03327646851539612,
"learning_rate": 1e-06,
"loss": 0.1437,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12443.0,
"completions/mean_length": 2076.515625,
"completions/mean_terminated_length": 1342.045166015625,
"completions/min_length": 343.0,
"completions/min_terminated_length": 343.0,
"entropy": 0.15217551589012146,
"epoch": 0.5710526315789474,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.030961882323026657,
"learning_rate": 1e-06,
"loss": 0.0089,
"num_tokens": 83365918.0,
"reward": 0.8457842469215393,
"reward_std": 0.14305046200752258,
"rewards/progression_diversity/mean": -0.0016604659613221884,
"rewards/progression_diversity/std": 0.020232077687978745,
"rewards/symbolic_reward_accuracy/mean": 0.935546875,
"rewards/symbolic_reward_accuracy/std": 0.24579854309558868,
"rewards/symbolic_reward_partial_score/mean": 0.9638671875,
"rewards/symbolic_reward_partial_score/std": 0.15897713601589203,
"rewards/tag_count_reward/mean": -0.046875,
"rewards/tag_count_reward/std": 0.21157780289649963,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.025794267654419,
"sampling/importance_sampling_ratio/min": 1.3557089005189482e-05,
"sampling/sampling_logp_difference/max": 11.208600997924805,
"sampling/sampling_logp_difference/mean": 0.06135455146431923,
"step": 217
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.13979589194059372,
"epoch": 0.5736842105263158,
"grad_norm": 0.030766695737838745,
"learning_rate": 1e-06,
"loss": 0.136,
"step": 218
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.1349509060382843,
"epoch": 0.5763157894736842,
"grad_norm": 0.024950722232460976,
"learning_rate": 1e-06,
"loss": 0.1487,
"step": 219
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.13936427235603333,
"epoch": 0.5789473684210527,
"grad_norm": 0.025016427040100098,
"learning_rate": 1e-06,
"loss": 0.1389,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9998.0,
"completions/mean_length": 2004.419921875,
"completions/mean_terminated_length": 1450.2373046875,
"completions/min_length": 292.0,
"completions/min_terminated_length": 292.0,
"entropy": 0.13566848635673523,
"epoch": 0.5815789473684211,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.03230908513069153,
"learning_rate": 1e-06,
"loss": 0.0997,
"num_tokens": 84804853.0,
"reward": 0.8050163984298706,
"reward_std": 0.20277492702007294,
"rewards/progression_diversity/mean": -0.001296035130508244,
"rewards/progression_diversity/std": 0.020735615864396095,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9256185293197632,
"rewards/symbolic_reward_partial_score/std": 0.23007379472255707,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0255374908447266,
"sampling/importance_sampling_ratio/min": 0.00021752847533207387,
"sampling/sampling_logp_difference/max": 8.433180809020996,
"sampling/sampling_logp_difference/mean": 0.06265204399824142,
"step": 221
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.13476034253835678,
"epoch": 0.5842105263157895,
"grad_norm": 0.038641829043626785,
"learning_rate": 1e-06,
"loss": 0.0417,
"step": 222
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.13380540907382965,
"epoch": 0.5868421052631579,
"grad_norm": 0.02986939251422882,
"learning_rate": 1e-06,
"loss": 0.0707,
"step": 223
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.13185346126556396,
"epoch": 0.5894736842105263,
"grad_norm": 0.04221319779753685,
"learning_rate": 1e-06,
"loss": 0.0909,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11142.0,
"completions/mean_length": 1651.677734375,
"completions/mean_terminated_length": 1207.0401611328125,
"completions/min_length": 276.0,
"completions/min_terminated_length": 276.0,
"entropy": 0.13264429569244385,
"epoch": 0.5921052631578947,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.03359711542725563,
"learning_rate": 1e-06,
"loss": 0.0813,
"num_tokens": 86012272.0,
"reward": 0.8497519493103027,
"reward_std": 0.12195061147212982,
"rewards/progression_diversity/mean": -0.0003971385594923049,
"rewards/progression_diversity/std": 0.007108698599040508,
"rewards/symbolic_reward_accuracy/mean": 0.943359375,
"rewards/symbolic_reward_accuracy/std": 0.23138070106506348,
"rewards/symbolic_reward_partial_score/mean": 0.95556640625,
"rewards/symbolic_reward_partial_score/std": 0.19622980058193207,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.024213194847107,
"sampling/importance_sampling_ratio/min": 1.59300361701753e-05,
"sampling/sampling_logp_difference/max": 11.047304153442383,
"sampling/sampling_logp_difference/mean": 0.06262955069541931,
"step": 225
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.13049791753292084,
"epoch": 0.5947368421052631,
"grad_norm": 0.013412282802164555,
"learning_rate": 1e-06,
"loss": 0.0659,
"step": 226
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.12776117026805878,
"epoch": 0.5973684210526315,
"grad_norm": 0.023844925686717033,
"learning_rate": 1e-06,
"loss": 0.044,
"step": 227
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.12381385266780853,
"epoch": 0.6,
"grad_norm": 0.028511611744761467,
"learning_rate": 1e-06,
"loss": 0.0867,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15721.0,
"completions/mean_length": 1463.47265625,
"completions/mean_terminated_length": 1196.5048828125,
"completions/min_length": 270.0,
"completions/min_terminated_length": 270.0,
"entropy": 0.1172075942158699,
"epoch": 0.6026315789473684,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.04271606728434563,
"learning_rate": 1e-06,
"loss": 0.0551,
"num_tokens": 87146690.0,
"reward": 0.8489208221435547,
"reward_std": 0.12613338232040405,
"rewards/progression_diversity/mean": -0.0005037329392507672,
"rewards/progression_diversity/std": 0.008694959804415703,
"rewards/symbolic_reward_accuracy/mean": 0.9375,
"rewards/symbolic_reward_accuracy/std": 0.2422981858253479,
"rewards/symbolic_reward_partial_score/mean": 0.9606119394302368,
"rewards/symbolic_reward_partial_score/std": 0.17724183201789856,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0227758884429932,
"sampling/importance_sampling_ratio/min": 8.189291838789359e-05,
"sampling/sampling_logp_difference/max": 9.4100980758667,
"sampling/sampling_logp_difference/mean": 0.05999467894434929,
"step": 229
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.11710315942764282,
"epoch": 0.6052631578947368,
"grad_norm": 0.008632275275886059,
"learning_rate": 1e-06,
"loss": 0.0071,
"step": 230
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.11399340257048607,
"epoch": 0.6078947368421053,
"grad_norm": 0.012347784824669361,
"learning_rate": 1e-06,
"loss": 0.0357,
"step": 231
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.109375,
"entropy": 0.11190313473343849,
"epoch": 0.6105263157894737,
"grad_norm": 0.022991470992565155,
"learning_rate": 1e-06,
"loss": 0.0425,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14312.0,
"completions/mean_length": 1782.529296875,
"completions/mean_terminated_length": 1311.5140380859375,
"completions/min_length": 329.0,
"completions/min_terminated_length": 329.0,
"entropy": 0.10688180476427078,
"epoch": 0.6131578947368421,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.03337698429822922,
"learning_rate": 1e-06,
"loss": 0.0355,
"num_tokens": 88459825.0,
"reward": 0.7974966168403625,
"reward_std": 0.17888271808624268,
"rewards/progression_diversity/mean": -0.00131894217338413,
"rewards/progression_diversity/std": 0.017751460894942284,
"rewards/symbolic_reward_accuracy/mean": 0.875,
"rewards/symbolic_reward_accuracy/std": 0.3310423493385315,
"rewards/symbolic_reward_partial_score/mean": 0.9187825322151184,
"rewards/symbolic_reward_partial_score/std": 0.24961021542549133,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0183002948760986,
"sampling/importance_sampling_ratio/min": 5.948247780906968e-05,
"sampling/sampling_logp_difference/max": 9.729828834533691,
"sampling/sampling_logp_difference/mean": 0.05178453028202057,
"step": 233
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.10239839181303978,
"epoch": 0.6157894736842106,
"grad_norm": 0.019929470494389534,
"learning_rate": 1e-06,
"loss": 0.0941,
"step": 234
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.09998245164752007,
"epoch": 0.618421052631579,
"grad_norm": 0.027180753648281097,
"learning_rate": 1e-06,
"loss": 0.0925,
"step": 235
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.10064789652824402,
"epoch": 0.6210526315789474,
"grad_norm": 0.032762862741947174,
"learning_rate": 1e-06,
"loss": 0.0622,
"step": 236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15458.0,
"completions/mean_length": 1760.564453125,
"completions/mean_terminated_length": 1227.726806640625,
"completions/min_length": 367.0,
"completions/min_terminated_length": 367.0,
"entropy": 0.09621577709913254,
"epoch": 0.6236842105263158,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.03691167011857033,
"learning_rate": 1e-06,
"loss": 0.1223,
"num_tokens": 89768210.0,
"reward": 0.8165925741195679,
"reward_std": 0.1556319147348404,
"rewards/progression_diversity/mean": -0.0008989507332444191,
"rewards/progression_diversity/std": 0.016222195699810982,
"rewards/symbolic_reward_accuracy/mean": 0.90234375,
"rewards/symbolic_reward_accuracy/std": 0.29713961482048035,
"rewards/symbolic_reward_partial_score/mean": 0.9283853769302368,
"rewards/symbolic_reward_partial_score/std": 0.23884880542755127,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0182127952575684,
"sampling/importance_sampling_ratio/min": 0.0003808625042438507,
"sampling/sampling_logp_difference/max": 7.873072147369385,
"sampling/sampling_logp_difference/mean": 0.052828144282102585,
"step": 237
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.10177255794405937,
"epoch": 0.6263157894736842,
"grad_norm": 0.01514112763106823,
"learning_rate": 1e-06,
"loss": 0.0734,
"step": 238
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.10082132369279861,
"epoch": 0.6289473684210526,
"grad_norm": 0.02351570315659046,
"learning_rate": 1e-06,
"loss": 0.0691,
"step": 239
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.10559947416186333,
"epoch": 0.631578947368421,
"grad_norm": 0.012419568374752998,
"learning_rate": 1e-06,
"loss": 0.0251,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8674.0,
"completions/mean_length": 1312.580078125,
"completions/mean_terminated_length": 1073.3511962890625,
"completions/min_length": 345.0,
"completions/min_terminated_length": 345.0,
"entropy": 0.10397671908140182,
"epoch": 0.6342105263157894,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.03270323947072029,
"learning_rate": 1e-06,
"loss": 0.036,
"num_tokens": 90821019.0,
"reward": 0.8449134826660156,
"reward_std": 0.10425379872322083,
"rewards/progression_diversity/mean": -0.0008408930152654648,
"rewards/progression_diversity/std": 0.01656174287199974,
"rewards/symbolic_reward_accuracy/mean": 0.9296875,
"rewards/symbolic_reward_accuracy/std": 0.25592297315597534,
"rewards/symbolic_reward_partial_score/mean": 0.9615885615348816,
"rewards/symbolic_reward_partial_score/std": 0.16416993737220764,
"rewards/tag_count_reward/mean": -0.013671875,
"rewards/tag_count_reward/std": 0.1162383034825325,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0181472301483154,
"sampling/importance_sampling_ratio/min": 2.46100570477914e-11,
"sampling/sampling_logp_difference/max": 24.427865982055664,
"sampling/sampling_logp_difference/mean": 0.055217523127794266,
"step": 241
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.10521255061030388,
"epoch": 0.6368421052631579,
"grad_norm": 0.021176166832447052,
"learning_rate": 1e-06,
"loss": 0.0615,
"step": 242
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.10355697944760323,
"epoch": 0.6394736842105263,
"grad_norm": 0.015480482950806618,
"learning_rate": 1e-06,
"loss": 0.0404,
"step": 243
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.10299444571137428,
"epoch": 0.6421052631578947,
"grad_norm": 0.046614497900009155,
"learning_rate": 1e-06,
"loss": 0.0686,
"step": 244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14336.0,
"completions/mean_length": 1815.578125,
"completions/mean_terminated_length": 1254.11767578125,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"entropy": 0.0950668603181839,
"epoch": 0.6447368421052632,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.03163347393274307,
"learning_rate": 1e-06,
"loss": 0.0472,
"num_tokens": 92168259.0,
"reward": 0.7833421230316162,
"reward_std": 0.1979527473449707,
"rewards/progression_diversity/mean": -0.0007511397707276046,
"rewards/progression_diversity/std": 0.010229557752609253,
"rewards/symbolic_reward_accuracy/mean": 0.85546875,
"rewards/symbolic_reward_accuracy/std": 0.35197147727012634,
"rewards/symbolic_reward_partial_score/mean": 0.91259765625,
"rewards/symbolic_reward_partial_score/std": 0.2514844834804535,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0157463550567627,
"sampling/importance_sampling_ratio/min": 8.007385726704896e-11,
"sampling/sampling_logp_difference/max": 23.248071670532227,
"sampling/sampling_logp_difference/mean": 0.04827887937426567,
"step": 245
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.09082784876227379,
"epoch": 0.6473684210526316,
"grad_norm": 0.03819683939218521,
"learning_rate": 1e-06,
"loss": 0.0754,
"step": 246
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.09024357795715332,
"epoch": 0.65,
"grad_norm": 0.03529435768723488,
"learning_rate": 1e-06,
"loss": 0.1095,
"step": 247
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.09114789962768555,
"epoch": 0.6526315789473685,
"grad_norm": 0.014682702720165253,
"learning_rate": 1e-06,
"loss": 0.0538,
"step": 248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11281.0,
"completions/mean_length": 1623.357421875,
"completions/mean_terminated_length": 1147.2076416015625,
"completions/min_length": 378.0,
"completions/min_terminated_length": 378.0,
"entropy": 0.08885756507515907,
"epoch": 0.6552631578947369,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.04838551953434944,
"learning_rate": 1e-06,
"loss": 0.1366,
"num_tokens": 93418746.0,
"reward": 0.826103687286377,
"reward_std": 0.17412379384040833,
"rewards/progression_diversity/mean": -0.0019385741325095296,
"rewards/progression_diversity/std": 0.022392842918634415,
"rewards/symbolic_reward_accuracy/mean": 0.912109375,
"rewards/symbolic_reward_accuracy/std": 0.2834126651287079,
"rewards/symbolic_reward_partial_score/mean": 0.93994140625,
"rewards/symbolic_reward_partial_score/std": 0.2181263417005539,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0159519910812378,
"sampling/importance_sampling_ratio/min": 4.792552772414638e-06,
"sampling/sampling_logp_difference/max": 12.24844741821289,
"sampling/sampling_logp_difference/mean": 0.049355216324329376,
"step": 249
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.09172747284173965,
"epoch": 0.6578947368421053,
"grad_norm": 0.020697645843029022,
"learning_rate": 1e-06,
"loss": 0.0594,
"step": 250
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.09275993704795837,
"epoch": 0.6605263157894737,
"grad_norm": 0.02590845711529255,
"learning_rate": 1e-06,
"loss": 0.0501,
"step": 251
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.09400615096092224,
"epoch": 0.6631578947368421,
"grad_norm": 0.03462434932589531,
"learning_rate": 1e-06,
"loss": 0.0493,
"step": 252
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8934.0,
"completions/mean_length": 1809.392578125,
"completions/mean_terminated_length": 1278.333984375,
"completions/min_length": 340.0,
"completions/min_terminated_length": 340.0,
"entropy": 0.09200675785541534,
"epoch": 0.6657894736842105,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.04529382660984993,
"learning_rate": 1e-06,
"loss": 0.0245,
"num_tokens": 94753315.0,
"reward": 0.8259365558624268,
"reward_std": 0.13518205285072327,
"rewards/progression_diversity/mean": -0.004001125227659941,
"rewards/progression_diversity/std": 0.04852549359202385,
"rewards/symbolic_reward_accuracy/mean": 0.916015625,
"rewards/symbolic_reward_accuracy/std": 0.2776356339454651,
"rewards/symbolic_reward_partial_score/mean": 0.9329427480697632,
"rewards/symbolic_reward_partial_score/std": 0.23484937846660614,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0148303508758545,
"sampling/importance_sampling_ratio/min": 1.1414035583356963e-07,
"sampling/sampling_logp_difference/max": 15.98583698272705,
"sampling/sampling_logp_difference/mean": 0.04799918830394745,
"step": 253
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.08636122196912766,
"epoch": 0.6684210526315789,
"grad_norm": 0.039753351360559464,
"learning_rate": 1e-06,
"loss": 0.1328,
"step": 254
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1015625,
"entropy": 0.09036806598305702,
"epoch": 0.6710526315789473,
"grad_norm": 0.02018708921968937,
"learning_rate": 1e-06,
"loss": 0.0184,
"step": 255
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.125,
"entropy": 0.08820921182632446,
"epoch": 0.6736842105263158,
"grad_norm": 0.03188847377896309,
"learning_rate": 1e-06,
"loss": 0.0965,
"step": 256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13585.0,
"completions/mean_length": 1527.927734375,
"completions/mean_terminated_length": 1171.382080078125,
"completions/min_length": 321.0,
"completions/min_terminated_length": 321.0,
"entropy": 0.09194277971982956,
"epoch": 0.6763157894736842,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.04443329945206642,
"learning_rate": 1e-06,
"loss": 0.064,
"num_tokens": 95937918.0,
"reward": 0.8315865397453308,
"reward_std": 0.13644886016845703,
"rewards/progression_diversity/mean": -0.0005308896070346236,
"rewards/progression_diversity/std": 0.0070388540625572205,
"rewards/symbolic_reward_accuracy/mean": 0.9140625,
"rewards/symbolic_reward_accuracy/std": 0.28054583072662354,
"rewards/symbolic_reward_partial_score/mean": 0.95166015625,
"rewards/symbolic_reward_partial_score/std": 0.1864015907049179,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.015270471572876,
"sampling/importance_sampling_ratio/min": 1.5392264726326823e-19,
"sampling/sampling_logp_difference/max": 43.31783676147461,
"sampling/sampling_logp_difference/mean": 0.04934335872530937,
"step": 257
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.08927204459905624,
"epoch": 0.6789473684210526,
"grad_norm": 0.030259989202022552,
"learning_rate": 1e-06,
"loss": 0.0617,
"step": 258
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.0892256572842598,
"epoch": 0.6815789473684211,
"grad_norm": 0.03653491288423538,
"learning_rate": 1e-06,
"loss": 0.0888,
"step": 259
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.08825241401791573,
"epoch": 0.6842105263157895,
"grad_norm": 0.02339651621878147,
"learning_rate": 1e-06,
"loss": 0.046,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10342.0,
"completions/mean_length": 1722.841796875,
"completions/mean_terminated_length": 1188.629638671875,
"completions/min_length": 365.0,
"completions/min_terminated_length": 365.0,
"entropy": 0.09076549112796783,
"epoch": 0.6868421052631579,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.021180791780352592,
"learning_rate": 1e-06,
"loss": 0.0357,
"num_tokens": 97201805.0,
"reward": 0.8398886919021606,
"reward_std": 0.13938692212104797,
"rewards/progression_diversity/mean": -0.00038591912016272545,
"rewards/progression_diversity/std": 0.005922461394220591,
"rewards/symbolic_reward_accuracy/mean": 0.9296875,
"rewards/symbolic_reward_accuracy/std": 0.25592297315597534,
"rewards/symbolic_reward_partial_score/mean": 0.9519857168197632,
"rewards/symbolic_reward_partial_score/std": 0.19745849072933197,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0142853260040283,
"sampling/importance_sampling_ratio/min": 2.9707528881317558e-08,
"sampling/sampling_logp_difference/max": 17.331865310668945,
"sampling/sampling_logp_difference/mean": 0.04651745408773422,
"step": 261
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.09219764545559883,
"epoch": 0.6894736842105263,
"grad_norm": 0.03137506544589996,
"learning_rate": 1e-06,
"loss": 0.0417,
"step": 262
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.09567607566714287,
"epoch": 0.6921052631578948,
"grad_norm": 0.022778861224651337,
"learning_rate": 1e-06,
"loss": 0.0223,
"step": 263
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.0844137892127037,
"epoch": 0.6947368421052632,
"grad_norm": 0.03902408108115196,
"learning_rate": 1e-06,
"loss": 0.1838,
"step": 264
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11656.0,
"completions/mean_length": 1918.83203125,
"completions/mean_terminated_length": 1269.37548828125,
"completions/min_length": 381.0,
"completions/min_terminated_length": 381.0,
"entropy": 0.09079957380890846,
"epoch": 0.6973684210526315,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.03956228867173195,
"learning_rate": 1e-06,
"loss": 0.0201,
"num_tokens": 98594391.0,
"reward": 0.7959408164024353,
"reward_std": 0.18128597736358643,
"rewards/progression_diversity/mean": -0.0006502005271613598,
"rewards/progression_diversity/std": 0.010507218539714813,
"rewards/symbolic_reward_accuracy/mean": 0.875,
"rewards/symbolic_reward_accuracy/std": 0.3310423493385315,
"rewards/symbolic_reward_partial_score/mean": 0.91748046875,
"rewards/symbolic_reward_partial_score/std": 0.25184011459350586,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0138227939605713,
"sampling/importance_sampling_ratio/min": 2.1744256395450634e-10,
"sampling/sampling_logp_difference/max": 22.249086380004883,
"sampling/sampling_logp_difference/mean": 0.04669389873743057,
"step": 265
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.0856584906578064,
"epoch": 0.7,
"grad_norm": 0.04341353103518486,
"learning_rate": 1e-06,
"loss": 0.0789,
"step": 266
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.08574041724205017,
"epoch": 0.7026315789473684,
"grad_norm": 0.04117586463689804,
"learning_rate": 1e-06,
"loss": 0.1279,
"step": 267
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.08641528338193893,
"epoch": 0.7052631578947368,
"grad_norm": 0.023934362456202507,
"learning_rate": 1e-06,
"loss": 0.0486,
"step": 268
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6950.0,
"completions/mean_length": 1519.431640625,
"completions/mean_terminated_length": 1039.929443359375,
"completions/min_length": 328.0,
"completions/min_terminated_length": 328.0,
"entropy": 0.09421858936548233,
"epoch": 0.7078947368421052,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.021700512617826462,
"learning_rate": 1e-06,
"loss": -0.0071,
"num_tokens": 99741620.0,
"reward": 0.8472155332565308,
"reward_std": 0.12852367758750916,
"rewards/progression_diversity/mean": -0.0001352034305455163,
"rewards/progression_diversity/std": 0.0023650997318327427,
"rewards/symbolic_reward_accuracy/mean": 0.939453125,
"rewards/symbolic_reward_accuracy/std": 0.2387305200099945,
"rewards/symbolic_reward_partial_score/mean": 0.95556640625,
"rewards/symbolic_reward_partial_score/std": 0.19245369732379913,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.014095664024353,
"sampling/importance_sampling_ratio/min": 0.0012474657269194722,
"sampling/sampling_logp_difference/max": 6.686641216278076,
"sampling/sampling_logp_difference/mean": 0.047991082072257996,
"step": 269
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.08904735371470451,
"epoch": 0.7105263157894737,
"grad_norm": 0.020569216459989548,
"learning_rate": 1e-06,
"loss": 0.0653,
"step": 270
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.08358699828386307,
"epoch": 0.7131578947368421,
"grad_norm": 0.036929428577423096,
"learning_rate": 1e-06,
"loss": 0.164,
"step": 271
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.125,
"entropy": 0.08975838124752045,
"epoch": 0.7157894736842105,
"grad_norm": 0.01153595745563507,
"learning_rate": 1e-06,
"loss": 0.0975,
"step": 272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11530.0,
"completions/mean_length": 1798.42578125,
"completions/mean_terminated_length": 1205.5162353515625,
"completions/min_length": 375.0,
"completions/min_terminated_length": 375.0,
"entropy": 0.08779791370034218,
"epoch": 0.718421052631579,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.051671918481588364,
"learning_rate": 1e-06,
"loss": 0.0716,
"num_tokens": 101081902.0,
"reward": 0.8015538454055786,
"reward_std": 0.1920056939125061,
"rewards/progression_diversity/mean": -0.0008653616532683372,
"rewards/progression_diversity/std": 0.011233367957174778,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9192708134651184,
"rewards/symbolic_reward_partial_score/std": 0.24985045194625854,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.012866497039795,
"sampling/importance_sampling_ratio/min": 5.919900445405801e-07,
"sampling/sampling_logp_difference/max": 14.339776039123535,
"sampling/sampling_logp_difference/mean": 0.047424331307411194,
"step": 273
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.08645644038915634,
"epoch": 0.7210526315789474,
"grad_norm": 0.04649584740400314,
"learning_rate": 1e-06,
"loss": 0.0667,
"step": 274
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.08046365156769753,
"epoch": 0.7236842105263158,
"grad_norm": 0.044324759393930435,
"learning_rate": 1e-06,
"loss": 0.1699,
"step": 275
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.08621568232774734,
"epoch": 0.7263157894736842,
"grad_norm": 0.033357467502355576,
"learning_rate": 1e-06,
"loss": 0.0483,
"step": 276
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11624.0,
"completions/mean_length": 1809.94140625,
"completions/mean_terminated_length": 1309.418212890625,
"completions/min_length": 373.0,
"completions/min_terminated_length": 373.0,
"entropy": 0.08192634955048561,
"epoch": 0.7289473684210527,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.04480903223156929,
"learning_rate": 1e-06,
"loss": 0.0156,
"num_tokens": 102443600.0,
"reward": 0.7796283960342407,
"reward_std": 0.20830506086349487,
"rewards/progression_diversity/mean": -0.001035432331264019,
"rewards/progression_diversity/std": 0.010857795365154743,
"rewards/symbolic_reward_accuracy/mean": 0.853515625,
"rewards/symbolic_reward_accuracy/std": 0.35393697023391724,
"rewards/symbolic_reward_partial_score/mean": 0.9021809697151184,
"rewards/symbolic_reward_partial_score/std": 0.27368080615997314,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0123987197875977,
"sampling/importance_sampling_ratio/min": 1.9622171695829784e-17,
"sampling/sampling_logp_difference/max": 38.469871520996094,
"sampling/sampling_logp_difference/mean": 0.04502936080098152,
"step": 277
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.08124225959181786,
"epoch": 0.7315789473684211,
"grad_norm": 0.03922456502914429,
"learning_rate": 1e-06,
"loss": 0.0856,
"step": 278
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.08088827505707741,
"epoch": 0.7342105263157894,
"grad_norm": 0.029460720717906952,
"learning_rate": 1e-06,
"loss": 0.0527,
"step": 279
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.07565705850720406,
"epoch": 0.7368421052631579,
"grad_norm": 0.03428329899907112,
"learning_rate": 1e-06,
"loss": 0.1428,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6622.0,
"completions/mean_length": 1865.5,
"completions/mean_terminated_length": 1213.64892578125,
"completions/min_length": 366.0,
"completions/min_terminated_length": 366.0,
"entropy": 0.08282288908958435,
"epoch": 0.7394736842105263,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.03280191496014595,
"learning_rate": 1e-06,
"loss": 0.048,
"num_tokens": 103822896.0,
"reward": 0.799896240234375,
"reward_std": 0.18378372490406036,
"rewards/progression_diversity/mean": -0.0006127399392426014,
"rewards/progression_diversity/std": 0.0077356030233204365,
"rewards/symbolic_reward_accuracy/mean": 0.876953125,
"rewards/symbolic_reward_accuracy/std": 0.32881227135658264,
"rewards/symbolic_reward_partial_score/mean": 0.9267578125,
"rewards/symbolic_reward_partial_score/std": 0.23403851687908173,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0123448371887207,
"sampling/importance_sampling_ratio/min": 1.5536484170297626e-06,
"sampling/sampling_logp_difference/max": 13.37490463256836,
"sampling/sampling_logp_difference/mean": 0.047334231436252594,
"step": 281
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.08162574470043182,
"epoch": 0.7421052631578947,
"grad_norm": 0.03995591774582863,
"learning_rate": 1e-06,
"loss": 0.1125,
"step": 282
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.08600019663572311,
"epoch": 0.7447368421052631,
"grad_norm": 0.02127450704574585,
"learning_rate": 1e-06,
"loss": 0.0228,
"step": 283
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.08134080097079277,
"epoch": 0.7473684210526316,
"grad_norm": 0.020339855924248695,
"learning_rate": 1e-06,
"loss": 0.1129,
"step": 284
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16203.0,
"completions/mean_length": 2301.19921875,
"completions/mean_terminated_length": 1362.345947265625,
"completions/min_length": 351.0,
"completions/min_terminated_length": 351.0,
"entropy": 0.07727199792861938,
"epoch": 0.75,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.059844452887773514,
"learning_rate": 1e-06,
"loss": 0.1395,
"num_tokens": 105415446.0,
"reward": 0.7762081623077393,
"reward_std": 0.252108633518219,
"rewards/progression_diversity/mean": -0.001261794357560575,
"rewards/progression_diversity/std": 0.012555805034935474,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.8855794072151184,
"rewards/symbolic_reward_partial_score/std": 0.30380791425704956,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0110630989074707,
"sampling/importance_sampling_ratio/min": 1.2527775652415585e-05,
"sampling/sampling_logp_difference/max": 11.287562370300293,
"sampling/sampling_logp_difference/mean": 0.04290057718753815,
"step": 285
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.07825002819299698,
"epoch": 0.7526315789473684,
"grad_norm": 0.03701354190707207,
"learning_rate": 1e-06,
"loss": 0.0698,
"step": 286
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.07666192576289177,
"epoch": 0.7552631578947369,
"grad_norm": 0.03503553569316864,
"learning_rate": 1e-06,
"loss": 0.1734,
"step": 287
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.07939735427498817,
"epoch": 0.7578947368421053,
"grad_norm": 0.03646966814994812,
"learning_rate": 1e-06,
"loss": 0.0965,
"step": 288
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12705.0,
"completions/mean_length": 2036.3125,
"completions/mean_terminated_length": 1392.1304931640625,
"completions/min_length": 357.0,
"completions/min_terminated_length": 357.0,
"entropy": 0.07943989709019661,
"epoch": 0.7605263157894737,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.054009877145290375,
"learning_rate": 1e-06,
"loss": 0.0678,
"num_tokens": 106871190.0,
"reward": 0.76430344581604,
"reward_std": 0.23938891291618347,
"rewards/progression_diversity/mean": -0.0003246946434956044,
"rewards/progression_diversity/std": 0.005887071136385202,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.88232421875,
"rewards/symbolic_reward_partial_score/std": 0.3022898733615875,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0122061967849731,
"sampling/importance_sampling_ratio/min": 4.789482318301452e-06,
"sampling/sampling_logp_difference/max": 12.249088287353516,
"sampling/sampling_logp_difference/mean": 0.046712085604667664,
"step": 289
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.08157060295343399,
"epoch": 0.7631578947368421,
"grad_norm": 0.04651428759098053,
"learning_rate": 1e-06,
"loss": 0.0802,
"step": 290
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.07789277285337448,
"epoch": 0.7657894736842106,
"grad_norm": 0.026174401864409447,
"learning_rate": 1e-06,
"loss": 0.1247,
"step": 291
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.08365096524357796,
"epoch": 0.7684210526315789,
"grad_norm": 0.03342805802822113,
"learning_rate": 1e-06,
"loss": 0.0482,
"step": 292
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12578.0,
"completions/mean_length": 1690.671875,
"completions/mean_terminated_length": 1368.0638427734375,
"completions/min_length": 316.0,
"completions/min_terminated_length": 316.0,
"entropy": 0.08383786678314209,
"epoch": 0.7710526315789473,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.051632702350616455,
"learning_rate": 1e-06,
"loss": 0.1011,
"num_tokens": 108138638.0,
"reward": 0.8156664371490479,
"reward_std": 0.16709403693675995,
"rewards/progression_diversity/mean": -0.0007459928747266531,
"rewards/progression_diversity/std": 0.013311400078237057,
"rewards/symbolic_reward_accuracy/mean": 0.896484375,
"rewards/symbolic_reward_accuracy/std": 0.30492907762527466,
"rewards/symbolic_reward_partial_score/mean": 0.93310546875,
"rewards/symbolic_reward_partial_score/std": 0.22900764644145966,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.013049602508545,
"sampling/importance_sampling_ratio/min": 8.142708793457132e-06,
"sampling/sampling_logp_difference/max": 11.718387603759766,
"sampling/sampling_logp_difference/mean": 0.050904612988233566,
"step": 293
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.0861295573413372,
"epoch": 0.7736842105263158,
"grad_norm": 0.021426010876893997,
"learning_rate": 1e-06,
"loss": 0.0375,
"step": 294
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.08208976686000824,
"epoch": 0.7763157894736842,
"grad_norm": 0.042882923036813736,
"learning_rate": 1e-06,
"loss": 0.0493,
"step": 295
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.08664394915103912,
"epoch": 0.7789473684210526,
"grad_norm": 0.023627059534192085,
"learning_rate": 1e-06,
"loss": 0.024,
"step": 296
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11644.0,
"completions/mean_length": 1897.166015625,
"completions/mean_terminated_length": 1338.849853515625,
"completions/min_length": 369.0,
"completions/min_terminated_length": 369.0,
"entropy": 0.08488818258047104,
"epoch": 0.781578947368421,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.07350046932697296,
"learning_rate": 1e-06,
"loss": 0.098,
"num_tokens": 109518627.0,
"reward": 0.8159432411193848,
"reward_std": 0.1821536421775818,
"rewards/progression_diversity/mean": -0.0023595020174980164,
"rewards/progression_diversity/std": 0.03643770143389702,
"rewards/symbolic_reward_accuracy/mean": 0.90625,
"rewards/symbolic_reward_accuracy/std": 0.29176566004753113,
"rewards/symbolic_reward_partial_score/mean": 0.9197590947151184,
"rewards/symbolic_reward_partial_score/std": 0.26337599754333496,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0125732421875,
"sampling/importance_sampling_ratio/min": 1.475510165438454e-11,
"sampling/sampling_logp_difference/max": 24.93943214416504,
"sampling/sampling_logp_difference/mean": 0.04736366495490074,
"step": 297
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.07962662726640701,
"epoch": 0.7842105263157895,
"grad_norm": 0.038738593459129333,
"learning_rate": 1e-06,
"loss": 0.1205,
"step": 298
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.0848800577223301,
"epoch": 0.7868421052631579,
"grad_norm": 0.015012623742222786,
"learning_rate": 1e-06,
"loss": 0.0471,
"step": 299
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.125,
"entropy": 0.08305321261286736,
"epoch": 0.7894736842105263,
"grad_norm": 0.014111812226474285,
"learning_rate": 1e-06,
"loss": 0.0205,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14519.0,
"completions/mean_length": 1872.60546875,
"completions/mean_terminated_length": 1404.4959716796875,
"completions/min_length": 314.0,
"completions/min_terminated_length": 314.0,
"entropy": 0.08086536824703217,
"epoch": 0.7921052631578948,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.05553090572357178,
"learning_rate": 1e-06,
"loss": 0.0673,
"num_tokens": 110901241.0,
"reward": 0.8010022640228271,
"reward_std": 0.2108319252729416,
"rewards/progression_diversity/mean": -0.002319404622539878,
"rewards/progression_diversity/std": 0.034735988825559616,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9109700918197632,
"rewards/symbolic_reward_partial_score/std": 0.26965585350990295,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0123342275619507,
"sampling/importance_sampling_ratio/min": 5.6792261458925644e-14,
"sampling/sampling_logp_difference/max": 30.49937629699707,
"sampling/sampling_logp_difference/mean": 0.04701223969459534,
"step": 301
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.08132592588663101,
"epoch": 0.7947368421052632,
"grad_norm": 0.05231938883662224,
"learning_rate": 1e-06,
"loss": 0.0842,
"step": 302
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.0790596604347229,
"epoch": 0.7973684210526316,
"grad_norm": 0.014687119983136654,
"learning_rate": 1e-06,
"loss": 0.1099,
"step": 303
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.08050628378987312,
"epoch": 0.8,
"grad_norm": 0.027776561677455902,
"learning_rate": 1e-06,
"loss": 0.071,
"step": 304
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14410.0,
"completions/mean_length": 1649.3046875,
"completions/mean_terminated_length": 1235.0762939453125,
"completions/min_length": 349.0,
"completions/min_terminated_length": 349.0,
"entropy": 0.07759291678667068,
"epoch": 0.8026315789473685,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.05399545282125473,
"learning_rate": 1e-06,
"loss": 0.1019,
"num_tokens": 112154165.0,
"reward": 0.8109291791915894,
"reward_std": 0.18250510096549988,
"rewards/progression_diversity/mean": -0.0008340342901647091,
"rewards/progression_diversity/std": 0.014236577786505222,
"rewards/symbolic_reward_accuracy/mean": 0.89453125,
"rewards/symbolic_reward_accuracy/std": 0.3074568510055542,
"rewards/symbolic_reward_partial_score/mean": 0.9231770634651184,
"rewards/symbolic_reward_partial_score/std": 0.25037682056427,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0121451616287231,
"sampling/importance_sampling_ratio/min": 2.1538072954964387e-18,
"sampling/sampling_logp_difference/max": 40.67929458618164,
"sampling/sampling_logp_difference/mean": 0.04661684110760689,
"step": 305
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.08159153163433075,
"epoch": 0.8052631578947368,
"grad_norm": 0.047688040882349014,
"learning_rate": 1e-06,
"loss": 0.0852,
"step": 306
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.08121361956000328,
"epoch": 0.8078947368421052,
"grad_norm": 0.017503326758742332,
"learning_rate": 1e-06,
"loss": 0.0402,
"step": 307
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.08150652050971985,
"epoch": 0.8105263157894737,
"grad_norm": 0.0493878610432148,
"learning_rate": 1e-06,
"loss": 0.0819,
"step": 308
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9856.0,
"completions/mean_length": 1619.880859375,
"completions/mean_terminated_length": 1143.618896484375,
"completions/min_length": 375.0,
"completions/min_terminated_length": 375.0,
"entropy": 0.08610192313790321,
"epoch": 0.8131578947368421,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.05817464739084244,
"learning_rate": 1e-06,
"loss": 0.0506,
"num_tokens": 113374680.0,
"reward": 0.8267406821250916,
"reward_std": 0.16926893591880798,
"rewards/progression_diversity/mean": -0.0017180759459733963,
"rewards/progression_diversity/std": 0.03334691748023033,
"rewards/symbolic_reward_accuracy/mean": 0.9140625,
"rewards/symbolic_reward_accuracy/std": 0.28054583072662354,
"rewards/symbolic_reward_partial_score/mean": 0.9381510019302368,
"rewards/symbolic_reward_partial_score/std": 0.22628821432590485,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0124841928482056,
"sampling/importance_sampling_ratio/min": 7.453480520780431e-06,
"sampling/sampling_logp_difference/max": 11.806829452514648,
"sampling/sampling_logp_difference/mean": 0.047006309032440186,
"step": 309
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.08357677236199379,
"epoch": 0.8157894736842105,
"grad_norm": 0.032003868371248245,
"learning_rate": 1e-06,
"loss": 0.0531,
"step": 310
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.08344599604606628,
"epoch": 0.8184210526315789,
"grad_norm": 0.032237276434898376,
"learning_rate": 1e-06,
"loss": 0.0649,
"step": 311
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.08107833191752434,
"epoch": 0.8210526315789474,
"grad_norm": 0.03354015573859215,
"learning_rate": 1e-06,
"loss": 0.0988,
"step": 312
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12625.0,
"completions/mean_length": 2280.06640625,
"completions/mean_terminated_length": 1308.396728515625,
"completions/min_length": 354.0,
"completions/min_terminated_length": 354.0,
"entropy": 0.07463454082608223,
"epoch": 0.8236842105263158,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0700058713555336,
"learning_rate": 1e-06,
"loss": 0.0735,
"num_tokens": 114952058.0,
"reward": 0.7640925645828247,
"reward_std": 0.19802439212799072,
"rewards/progression_diversity/mean": -0.0018792236223816872,
"rewards/progression_diversity/std": 0.02314453385770321,
"rewards/symbolic_reward_accuracy/mean": 0.841796875,
"rewards/symbolic_reward_accuracy/std": 0.36528825759887695,
"rewards/symbolic_reward_partial_score/mean": 0.8849283456802368,
"rewards/symbolic_reward_partial_score/std": 0.2975020110607147,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.011326789855957,
"sampling/importance_sampling_ratio/min": 1.1499383845148259e-06,
"sampling/sampling_logp_difference/max": 13.675802230834961,
"sampling/sampling_logp_difference/mean": 0.04442109167575836,
"step": 313
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.0740601234138012,
"epoch": 0.8263157894736842,
"grad_norm": 0.03972548991441727,
"learning_rate": 1e-06,
"loss": 0.0904,
"step": 314
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.0788533091545105,
"epoch": 0.8289473684210527,
"grad_norm": 0.01969255320727825,
"learning_rate": 1e-06,
"loss": 0.0561,
"step": 315
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.0717739462852478,
"epoch": 0.8315789473684211,
"grad_norm": 0.019936300814151764,
"learning_rate": 1e-06,
"loss": 0.1043,
"step": 316
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7844.0,
"completions/mean_length": 2019.439453125,
"completions/mean_terminated_length": 1188.4317626953125,
"completions/min_length": 336.0,
"completions/min_terminated_length": 336.0,
"entropy": 0.07409506663680077,
"epoch": 0.8342105263157895,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.05379168689250946,
"learning_rate": 1e-06,
"loss": 0.1369,
"num_tokens": 116408507.0,
"reward": 0.794788122177124,
"reward_std": 0.20335128903388977,
"rewards/progression_diversity/mean": -0.003609658218920231,
"rewards/progression_diversity/std": 0.039214227348566055,
"rewards/symbolic_reward_accuracy/mean": 0.873046875,
"rewards/symbolic_reward_accuracy/std": 0.33324605226516724,
"rewards/symbolic_reward_partial_score/mean": 0.9215494990348816,
"rewards/symbolic_reward_partial_score/std": 0.24459369480609894,
"rewards/tag_count_reward/mean": -0.0546875,
"rewards/tag_count_reward/std": 0.2275916188955307,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0107078552246094,
"sampling/importance_sampling_ratio/min": 6.796539331332951e-09,
"sampling/sampling_logp_difference/max": 18.806852340698242,
"sampling/sampling_logp_difference/mean": 0.0424201525747776,
"step": 317
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.07684982568025589,
"epoch": 0.8368421052631579,
"grad_norm": 0.02561601623892784,
"learning_rate": 1e-06,
"loss": 0.0562,
"step": 318
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.07624761387705803,
"epoch": 0.8394736842105263,
"grad_norm": 0.04118689149618149,
"learning_rate": 1e-06,
"loss": 0.0745,
"step": 319
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.07758089527487755,
"epoch": 0.8421052631578947,
"grad_norm": 0.02611648663878441,
"learning_rate": 1e-06,
"loss": 0.084,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14539.0,
"completions/mean_length": 1871.75,
"completions/mean_terminated_length": 1063.8515625,
"completions/min_length": 292.0,
"completions/min_terminated_length": 292.0,
"entropy": 0.0785975344479084,
"epoch": 0.8447368421052631,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.01912563666701317,
"learning_rate": 1e-06,
"loss": 0.0434,
"num_tokens": 117755291.0,
"reward": 0.8071247339248657,
"reward_std": 0.1330493986606598,
"rewards/progression_diversity/mean": -0.005300430115312338,
"rewards/progression_diversity/std": 0.05674071982502937,
"rewards/symbolic_reward_accuracy/mean": 0.888671875,
"rewards/symbolic_reward_accuracy/std": 0.31484565138816833,
"rewards/symbolic_reward_partial_score/mean": 0.9308267831802368,
"rewards/symbolic_reward_partial_score/std": 0.2333908975124359,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0106561183929443,
"sampling/importance_sampling_ratio/min": 9.975841486209447e-09,
"sampling/sampling_logp_difference/max": 18.423099517822266,
"sampling/sampling_logp_difference/mean": 0.045750319957733154,
"step": 321
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.07452228292822838,
"epoch": 0.8473684210526315,
"grad_norm": 0.028663959354162216,
"learning_rate": 1e-06,
"loss": 0.0673,
"step": 322
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.07840859517455101,
"epoch": 0.85,
"grad_norm": 0.01813848502933979,
"learning_rate": 1e-06,
"loss": 0.0938,
"step": 323
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.07799200713634491,
"epoch": 0.8526315789473684,
"grad_norm": 0.015808766707777977,
"learning_rate": 1e-06,
"loss": 0.0841,
"step": 324
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15622.0,
"completions/mean_length": 1765.408203125,
"completions/mean_terminated_length": 1077.826171875,
"completions/min_length": 309.0,
"completions/min_terminated_length": 309.0,
"entropy": 0.07237191870808601,
"epoch": 0.8552631578947368,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.05738206207752228,
"learning_rate": 1e-06,
"loss": 0.1679,
"num_tokens": 119072172.0,
"reward": 0.8257243633270264,
"reward_std": 0.1704496443271637,
"rewards/progression_diversity/mean": -0.0008076863596215844,
"rewards/progression_diversity/std": 0.008417648263275623,
"rewards/symbolic_reward_accuracy/mean": 0.91796875,
"rewards/symbolic_reward_accuracy/std": 0.2746807038784027,
"rewards/symbolic_reward_partial_score/mean": 0.9314778447151184,
"rewards/symbolic_reward_partial_score/std": 0.24522030353546143,
"rewards/tag_count_reward/mean": -0.044921875,
"rewards/tag_count_reward/std": 0.20733514428138733,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0103081464767456,
"sampling/importance_sampling_ratio/min": 7.582560135332983e-10,
"sampling/sampling_logp_difference/max": 21.0,
"sampling/sampling_logp_difference/mean": 0.04462500289082527,
"step": 325
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.07543528452515602,
"epoch": 0.8578947368421053,
"grad_norm": 0.022561442106962204,
"learning_rate": 1e-06,
"loss": 0.0468,
"step": 326
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.07968015596270561,
"epoch": 0.8605263157894737,
"grad_norm": 0.03309568762779236,
"learning_rate": 1e-06,
"loss": 0.0492,
"step": 327
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.08061303943395615,
"epoch": 0.8631578947368421,
"grad_norm": 0.017540300264954567,
"learning_rate": 1e-06,
"loss": 0.0731,
"step": 328
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8829.0,
"completions/mean_length": 1729.509765625,
"completions/mean_terminated_length": 1133.7987060546875,
"completions/min_length": 327.0,
"completions/min_terminated_length": 327.0,
"entropy": 0.0855712741613388,
"epoch": 0.8657894736842106,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.04006322845816612,
"learning_rate": 1e-06,
"loss": 0.0258,
"num_tokens": 120356817.0,
"reward": 0.7963314652442932,
"reward_std": 0.1821439266204834,
"rewards/progression_diversity/mean": -0.0006411472568288445,
"rewards/progression_diversity/std": 0.013062160462141037,
"rewards/symbolic_reward_accuracy/mean": 0.875,
"rewards/symbolic_reward_accuracy/std": 0.3310423493385315,
"rewards/symbolic_reward_partial_score/mean": 0.9181314706802368,
"rewards/symbolic_reward_partial_score/std": 0.24649207293987274,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.011404275894165,
"sampling/importance_sampling_ratio/min": 2.223772499476695e-09,
"sampling/sampling_logp_difference/max": 19.924060821533203,
"sampling/sampling_logp_difference/mean": 0.045835498720407486,
"step": 329
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.07828561961650848,
"epoch": 0.868421052631579,
"grad_norm": 0.032283343374729156,
"learning_rate": 1e-06,
"loss": 0.093,
"step": 330
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.08197357133030891,
"epoch": 0.8710526315789474,
"grad_norm": 0.04965886473655701,
"learning_rate": 1e-06,
"loss": 0.0956,
"step": 331
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.08033063635230064,
"epoch": 0.8736842105263158,
"grad_norm": 0.029129937291145325,
"learning_rate": 1e-06,
"loss": 0.0689,
"step": 332
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9487.0,
"completions/mean_length": 1397.1328125,
"completions/mean_terminated_length": 1006.6934204101562,
"completions/min_length": 306.0,
"completions/min_terminated_length": 306.0,
"entropy": 0.08633046597242355,
"epoch": 0.8763157894736842,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.034233320504426956,
"learning_rate": 1e-06,
"loss": 0.0423,
"num_tokens": 121475957.0,
"reward": 0.8381311893463135,
"reward_std": 0.1662929654121399,
"rewards/progression_diversity/mean": -0.0003610485000535846,
"rewards/progression_diversity/std": 0.006844029296189547,
"rewards/symbolic_reward_accuracy/mean": 0.927734375,
"rewards/symbolic_reward_accuracy/std": 0.2591804563999176,
"rewards/symbolic_reward_partial_score/mean": 0.94677734375,
"rewards/symbolic_reward_partial_score/std": 0.2108106017112732,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0122419595718384,
"sampling/importance_sampling_ratio/min": 3.6730430110765155e-06,
"sampling/sampling_logp_difference/max": 12.514490127563477,
"sampling/sampling_logp_difference/mean": 0.047133252024650574,
"step": 333
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.08650662377476692,
"epoch": 0.8789473684210526,
"grad_norm": 0.013222447596490383,
"learning_rate": 1e-06,
"loss": 0.0508,
"step": 334
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.08855986595153809,
"epoch": 0.881578947368421,
"grad_norm": 0.010891803540289402,
"learning_rate": 1e-06,
"loss": 0.0263,
"step": 335
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.0886671282351017,
"epoch": 0.8842105263157894,
"grad_norm": 0.014351065270602703,
"learning_rate": 1e-06,
"loss": 0.0698,
"step": 336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.041015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15569.0,
"completions/mean_length": 1693.0078125,
"completions/mean_terminated_length": 1064.67626953125,
"completions/min_length": 292.0,
"completions/min_terminated_length": 292.0,
"entropy": 0.08585496246814728,
"epoch": 0.8868421052631579,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.0539562962949276,
"learning_rate": 1e-06,
"loss": 0.1175,
"num_tokens": 122741049.0,
"reward": 0.8183258771896362,
"reward_std": 0.15719908475875854,
"rewards/progression_diversity/mean": -0.0033535552211105824,
"rewards/progression_diversity/std": 0.04218889772891998,
"rewards/symbolic_reward_accuracy/mean": 0.90625,
"rewards/symbolic_reward_accuracy/std": 0.29176566004753113,
"rewards/symbolic_reward_partial_score/mean": 0.9290364384651184,
"rewards/symbolic_reward_partial_score/std": 0.24505099654197693,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0127696990966797,
"sampling/importance_sampling_ratio/min": 7.183120487752603e-06,
"sampling/sampling_logp_difference/max": 11.84377670288086,
"sampling/sampling_logp_difference/mean": 0.045910563319921494,
"step": 337
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.08782363682985306,
"epoch": 0.8894736842105263,
"grad_norm": 0.026008745655417442,
"learning_rate": 1e-06,
"loss": 0.0413,
"step": 338
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.09032351523637772,
"epoch": 0.8921052631578947,
"grad_norm": 0.05031033605337143,
"learning_rate": 1e-06,
"loss": 0.1113,
"step": 339
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.0947401411831379,
"epoch": 0.8947368421052632,
"grad_norm": 0.020250199362635612,
"learning_rate": 1e-06,
"loss": 0.0466,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7096.0,
"completions/mean_length": 1569.30078125,
"completions/mean_terminated_length": 967.0772094726562,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"entropy": 0.09787581861019135,
"epoch": 0.8973684210526316,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.03366163745522499,
"learning_rate": 1e-06,
"loss": 0.0513,
"num_tokens": 123942163.0,
"reward": 0.8242777585983276,
"reward_std": 0.17191796004772186,
"rewards/progression_diversity/mean": -0.0038672885857522488,
"rewards/progression_diversity/std": 0.05004805698990822,
"rewards/symbolic_reward_accuracy/mean": 0.9140625,
"rewards/symbolic_reward_accuracy/std": 0.28054583072662354,
"rewards/symbolic_reward_partial_score/mean": 0.9326171875,
"rewards/symbolic_reward_partial_score/std": 0.23826108872890472,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0147292613983154,
"sampling/importance_sampling_ratio/min": 4.3840016587637365e-05,
"sampling/sampling_logp_difference/max": 10.034963607788086,
"sampling/sampling_logp_difference/mean": 0.05117640271782875,
"step": 341
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.09365754574537277,
"epoch": 0.9,
"grad_norm": 0.029882870614528656,
"learning_rate": 1e-06,
"loss": 0.0784,
"step": 342
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.09345110133290291,
"epoch": 0.9026315789473685,
"grad_norm": 0.03513422608375549,
"learning_rate": 1e-06,
"loss": 0.0559,
"step": 343
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.09143765270709991,
"epoch": 0.9052631578947369,
"grad_norm": 0.024734172970056534,
"learning_rate": 1e-06,
"loss": 0.0746,
"step": 344
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6936.0,
"completions/mean_length": 1666.32421875,
"completions/mean_terminated_length": 1005.5305786132812,
"completions/min_length": 316.0,
"completions/min_terminated_length": 316.0,
"entropy": 0.09469415619969368,
"epoch": 0.9078947368421053,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.03798873722553253,
"learning_rate": 1e-06,
"loss": 0.0727,
"num_tokens": 125223993.0,
"reward": 0.810152530670166,
"reward_std": 0.19300349056720734,
"rewards/progression_diversity/mean": -0.00526084192097187,
"rewards/progression_diversity/std": 0.053911034017801285,
"rewards/symbolic_reward_accuracy/mean": 0.8984375,
"rewards/symbolic_reward_accuracy/std": 0.30236753821372986,
"rewards/symbolic_reward_partial_score/mean": 0.9181314706802368,
"rewards/symbolic_reward_partial_score/std": 0.26354485750198364,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0131090879440308,
"sampling/importance_sampling_ratio/min": 1.7146856407634914e-05,
"sampling/sampling_logp_difference/max": 10.973695755004883,
"sampling/sampling_logp_difference/mean": 0.04458559304475784,
"step": 345
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.08893976360559464,
"epoch": 0.9105263157894737,
"grad_norm": 0.03366508707404137,
"learning_rate": 1e-06,
"loss": 0.1022,
"step": 346
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.09565364941954613,
"epoch": 0.9131578947368421,
"grad_norm": 0.01510405819863081,
"learning_rate": 1e-06,
"loss": 0.0209,
"step": 347
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.09264839813113213,
"epoch": 0.9157894736842105,
"grad_norm": 0.015571820549666882,
"learning_rate": 1e-06,
"loss": 0.1362,
"step": 348
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8325.0,
"completions/mean_length": 1417.078125,
"completions/mean_terminated_length": 934.274169921875,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.09942467883229256,
"epoch": 0.9184210526315789,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.046768125146627426,
"learning_rate": 1e-06,
"loss": 0.0793,
"num_tokens": 126347329.0,
"reward": 0.8201934099197388,
"reward_std": 0.18495051562786102,
"rewards/progression_diversity/mean": -0.0021503996104002,
"rewards/progression_diversity/std": 0.03275972977280617,
"rewards/symbolic_reward_accuracy/mean": 0.908203125,
"rewards/symbolic_reward_accuracy/std": 0.289021372795105,
"rewards/symbolic_reward_partial_score/mean": 0.9280598759651184,
"rewards/symbolic_reward_partial_score/std": 0.24548624455928802,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0158005952835083,
"sampling/importance_sampling_ratio/min": 1.7636549500821275e-06,
"sampling/sampling_logp_difference/max": 13.248122215270996,
"sampling/sampling_logp_difference/mean": 0.05201897770166397,
"step": 349
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.09871027618646622,
"epoch": 0.9210526315789473,
"grad_norm": 0.015367010608315468,
"learning_rate": 1e-06,
"loss": 0.0738,
"step": 350
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.09856873750686646,
"epoch": 0.9236842105263158,
"grad_norm": 0.02123577892780304,
"learning_rate": 1e-06,
"loss": 0.0286,
"step": 351
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.09770620614290237,
"epoch": 0.9263157894736842,
"grad_norm": 0.02374398149549961,
"learning_rate": 1e-06,
"loss": 0.072,
"step": 352
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.080078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12225.0,
"completions/mean_length": 2309.302734375,
"completions/mean_terminated_length": 1084.1168212890625,
"completions/min_length": 315.0,
"completions/min_terminated_length": 315.0,
"entropy": 0.09652888029813766,
"epoch": 0.9289473684210526,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.02025141753256321,
"learning_rate": 1e-06,
"loss": 0.0611,
"num_tokens": 127938332.0,
"reward": 0.7528777122497559,
"reward_std": 0.21120837330818176,
"rewards/progression_diversity/mean": -0.0052050938829779625,
"rewards/progression_diversity/std": 0.045792821794748306,
"rewards/symbolic_reward_accuracy/mean": 0.8359375,
"rewards/symbolic_reward_accuracy/std": 0.37069445848464966,
"rewards/symbolic_reward_partial_score/mean": 0.8645833134651184,
"rewards/symbolic_reward_partial_score/std": 0.32852864265441895,
"rewards/tag_count_reward/mean": -0.080078125,
"rewards/tag_count_reward/std": 0.271679550409317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0146894454956055,
"sampling/importance_sampling_ratio/min": 3.716075696047483e-07,
"sampling/sampling_logp_difference/max": 14.805427551269531,
"sampling/sampling_logp_difference/mean": 0.050460390746593475,
"step": 353
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.09619471430778503,
"epoch": 0.9315789473684211,
"grad_norm": 0.033115409314632416,
"learning_rate": 1e-06,
"loss": 0.0835,
"step": 354
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.09719686955213547,
"epoch": 0.9342105263157895,
"grad_norm": 0.03017679788172245,
"learning_rate": 1e-06,
"loss": 0.111,
"step": 355
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.09859620407223701,
"epoch": 0.9368421052631579,
"grad_norm": 0.02311846613883972,
"learning_rate": 1e-06,
"loss": 0.0881,
"step": 356
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14940.0,
"completions/mean_length": 1620.341796875,
"completions/mean_terminated_length": 894.2601928710938,
"completions/min_length": 263.0,
"completions/min_terminated_length": 263.0,
"entropy": 0.10532217100262642,
"epoch": 0.9394736842105263,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.028915587812662125,
"learning_rate": 1e-06,
"loss": 0.039,
"num_tokens": 129171915.0,
"reward": 0.8127701282501221,
"reward_std": 0.16265010833740234,
"rewards/progression_diversity/mean": -0.0022891198750585318,
"rewards/progression_diversity/std": 0.022422684356570244,
"rewards/symbolic_reward_accuracy/mean": 0.90234375,
"rewards/symbolic_reward_accuracy/std": 0.29713961482048035,
"rewards/symbolic_reward_partial_score/mean": 0.9202474355697632,
"rewards/symbolic_reward_partial_score/std": 0.2603859007358551,
"rewards/tag_count_reward/mean": -0.046875,
"rewards/tag_count_reward/std": 0.21157780289649963,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0165646076202393,
"sampling/importance_sampling_ratio/min": 5.568129235709263e-17,
"sampling/sampling_logp_difference/max": 37.42688751220703,
"sampling/sampling_logp_difference/mean": 0.053267642855644226,
"step": 357
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.10380205139517784,
"epoch": 0.9421052631578948,
"grad_norm": 0.039858750998973846,
"learning_rate": 1e-06,
"loss": 0.1027,
"step": 358
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.10536672174930573,
"epoch": 0.9447368421052632,
"grad_norm": 0.011492653749883175,
"learning_rate": 1e-06,
"loss": 0.0268,
"step": 359
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.10100537538528442,
"epoch": 0.9473684210526315,
"grad_norm": 0.04140660539269447,
"learning_rate": 1e-06,
"loss": 0.0916,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5861.0,
"completions/mean_length": 1051.1953125,
"completions/mean_terminated_length": 869.3834228515625,
"completions/min_length": 258.0,
"completions/min_terminated_length": 258.0,
"entropy": 0.11518026143312454,
"epoch": 0.95,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.029431862756609917,
"learning_rate": 1e-06,
"loss": -0.0004,
"num_tokens": 130096271.0,
"reward": 0.8326168060302734,
"reward_std": 0.1631331741809845,
"rewards/progression_diversity/mean": -4.066104520461522e-05,
"rewards/progression_diversity/std": 0.000920054386369884,
"rewards/symbolic_reward_accuracy/mean": 0.91796875,
"rewards/symbolic_reward_accuracy/std": 0.2746807038784027,
"rewards/symbolic_reward_partial_score/mean": 0.943359375,
"rewards/symbolic_reward_partial_score/std": 0.21264995634555817,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.019619345664978,
"sampling/importance_sampling_ratio/min": 5.807437264593318e-05,
"sampling/sampling_logp_difference/max": 9.753786087036133,
"sampling/sampling_logp_difference/mean": 0.06247701495885849,
"step": 361
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.11114299297332764,
"epoch": 0.9526315789473684,
"grad_norm": 0.045638490468263626,
"learning_rate": 1e-06,
"loss": 0.058,
"step": 362
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.10937144979834557,
"epoch": 0.9552631578947368,
"grad_norm": 0.013277344405651093,
"learning_rate": 1e-06,
"loss": 0.0497,
"step": 363
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.11187266558408737,
"epoch": 0.9578947368421052,
"grad_norm": 0.013971379958093166,
"learning_rate": 1e-06,
"loss": 0.0041,
"step": 364
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6367.0,
"completions/mean_length": 1118.857421875,
"completions/mean_terminated_length": 845.7236328125,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"entropy": 0.10873980447649956,
"epoch": 0.9605263157894737,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.03028622269630432,
"learning_rate": 1e-06,
"loss": 0.0177,
"num_tokens": 131061606.0,
"reward": 0.8580078482627869,
"reward_std": 0.12159307301044464,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.951171875,
"rewards/symbolic_reward_accuracy/std": 0.2157193273305893,
"rewards/symbolic_reward_partial_score/mean": 0.9635416269302368,
"rewards/symbolic_reward_partial_score/std": 0.17610274255275726,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0192532539367676,
"sampling/importance_sampling_ratio/min": 3.5043937196554964e-10,
"sampling/sampling_logp_difference/max": 21.771833419799805,
"sampling/sampling_logp_difference/mean": 0.062029507011175156,
"step": 365
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.11343821510672569,
"epoch": 0.9631578947368421,
"grad_norm": 0.006393097806721926,
"learning_rate": 1e-06,
"loss": 0.0393,
"step": 366
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.10997330024838448,
"epoch": 0.9657894736842105,
"grad_norm": 0.04175656661391258,
"learning_rate": 1e-06,
"loss": 0.0229,
"step": 367
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.11030839011073112,
"epoch": 0.968421052631579,
"grad_norm": 0.009800048545002937,
"learning_rate": 1e-06,
"loss": 0.0279,
"step": 368
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.072265625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8872.0,
"completions/mean_length": 2057.54296875,
"completions/mean_terminated_length": 941.5873413085938,
"completions/min_length": 294.0,
"completions/min_terminated_length": 294.0,
"entropy": 0.10226080939173698,
"epoch": 0.9710526315789474,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0458550862967968,
"learning_rate": 1e-06,
"loss": 0.1033,
"num_tokens": 132521884.0,
"reward": 0.7442119121551514,
"reward_std": 0.21518045663833618,
"rewards/progression_diversity/mean": -0.0026392091531306505,
"rewards/progression_diversity/std": 0.02361941523849964,
"rewards/symbolic_reward_accuracy/mean": 0.822265625,
"rewards/symbolic_reward_accuracy/std": 0.3826628625392914,
"rewards/symbolic_reward_partial_score/mean": 0.8590494394302368,
"rewards/symbolic_reward_partial_score/std": 0.33246058225631714,
"rewards/tag_count_reward/mean": -0.068359375,
"rewards/tag_count_reward/std": 0.25260838866233826,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0173622369766235,
"sampling/importance_sampling_ratio/min": 5.933624197496101e-05,
"sampling/sampling_logp_difference/max": 9.732290267944336,
"sampling/sampling_logp_difference/mean": 0.05440949648618698,
"step": 369
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.10158997401595116,
"epoch": 0.9736842105263158,
"grad_norm": 0.038810838013887405,
"learning_rate": 1e-06,
"loss": 0.0925,
"step": 370
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.10655439645051956,
"epoch": 0.9763157894736842,
"grad_norm": 0.019916830584406853,
"learning_rate": 1e-06,
"loss": 0.0455,
"step": 371
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.10742965340614319,
"epoch": 0.9789473684210527,
"grad_norm": 0.01620314083993435,
"learning_rate": 1e-06,
"loss": 0.0595,
"step": 372
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7506.0,
"completions/mean_length": 2036.57421875,
"completions/mean_terminated_length": 820.690673828125,
"completions/min_length": 281.0,
"completions/min_terminated_length": 281.0,
"entropy": 0.1056484505534172,
"epoch": 0.9815789473684211,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.028215283527970314,
"learning_rate": 1e-06,
"loss": 0.0996,
"num_tokens": 133962402.0,
"reward": 0.7769756317138672,
"reward_std": 0.14990827441215515,
"rewards/progression_diversity/mean": -0.002632810501381755,
"rewards/progression_diversity/std": 0.029632003977894783,
"rewards/symbolic_reward_accuracy/mean": 0.857421875,
"rewards/symbolic_reward_accuracy/std": 0.3499840497970581,
"rewards/symbolic_reward_partial_score/mean": 0.9005533456802368,
"rewards/symbolic_reward_partial_score/std": 0.28045758605003357,
"rewards/tag_count_reward/mean": -0.076171875,
"rewards/tag_count_reward/std": 0.26553234457969666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0167696475982666,
"sampling/importance_sampling_ratio/min": 2.0920060388868178e-14,
"sampling/sampling_logp_difference/max": 31.49806785583496,
"sampling/sampling_logp_difference/mean": 0.052999142557382584,
"step": 373
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.10324260219931602,
"epoch": 0.9842105263157894,
"grad_norm": 0.03451311215758324,
"learning_rate": 1e-06,
"loss": 0.0725,
"step": 374
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.10772082582116127,
"epoch": 0.9868421052631579,
"grad_norm": 0.01771300472319126,
"learning_rate": 1e-06,
"loss": 0.0278,
"step": 375
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.10192153975367546,
"epoch": 0.9894736842105263,
"grad_norm": 0.05016673728823662,
"learning_rate": 1e-06,
"loss": 0.1221,
"step": 376
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14736.0,
"completions/mean_length": 1309.953125,
"completions/mean_terminated_length": 792.2586059570312,
"completions/min_length": 276.0,
"completions/min_terminated_length": 276.0,
"entropy": 0.11359592527151108,
"epoch": 0.9921052631578947,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.02216515503823757,
"learning_rate": 1e-06,
"loss": 0.0448,
"num_tokens": 135040714.0,
"reward": 0.8058879971504211,
"reward_std": 0.18076883256435394,
"rewards/progression_diversity/mean": -0.0020200079306960106,
"rewards/progression_diversity/std": 0.026723647490143776,
"rewards/symbolic_reward_accuracy/mean": 0.890625,
"rewards/symbolic_reward_accuracy/std": 0.31241437792778015,
"rewards/symbolic_reward_partial_score/mean": 0.9161783456802368,
"rewards/symbolic_reward_partial_score/std": 0.2642694115638733,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0174283981323242,
"sampling/importance_sampling_ratio/min": 1.079955563909607e-05,
"sampling/sampling_logp_difference/max": 11.436005592346191,
"sampling/sampling_logp_difference/mean": 0.057007431983947754,
"step": 377
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.1086348332464695,
"epoch": 0.9947368421052631,
"grad_norm": 0.022858861833810806,
"learning_rate": 1e-06,
"loss": 0.0659,
"step": 378
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.11348432675004005,
"epoch": 0.9973684210526316,
"grad_norm": 0.01786152645945549,
"learning_rate": 1e-06,
"loss": 0.0745,
"step": 379
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.1098298691213131,
"epoch": 1.0,
"grad_norm": 0.014867125079035759,
"learning_rate": 1e-06,
"loss": 0.118,
"step": 380
},
{
"epoch": 1.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.015380859375,
"eval_completions/max_length": 13116.78125,
"eval_completions/max_terminated_length": 3658.3125,
"eval_completions/mean_length": 877.506103515625,
"eval_completions/mean_terminated_length": 635.8593416213989,
"eval_completions/min_length": 274.03125,
"eval_completions/min_terminated_length": 274.03125,
"eval_entropy": 0.11584724555723369,
"eval_frac_reward_zero_std": 0.50390625,
"eval_loss": 0.018385088071227074,
"eval_num_tokens": 135040714.0,
"eval_reward": 0.8390934336930513,
"eval_reward_std": 0.13885579153429717,
"eval_rewards/progression_diversity/mean": -0.0005718442766351473,
"eval_rewards/progression_diversity/std": 0.005817418514197925,
"eval_rewards/symbolic_reward_accuracy/mean": 0.928466796875,
"eval_rewards/symbolic_reward_accuracy/std": 0.24404766922816634,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9451904278248549,
"eval_rewards/symbolic_reward_partial_score/std": 0.19986428710399196,
"eval_rewards/tag_count_reward/mean": -0.015380859375,
"eval_rewards/tag_count_reward/std": 0.10135847562924027,
"eval_runtime": 3216.5627,
"eval_samples_per_second": 0.078,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.020699668675661,
"eval_sampling/importance_sampling_ratio/min": 0.0016229211131763817,
"eval_sampling/sampling_logp_difference/max": 8.347711205482483,
"eval_sampling/sampling_logp_difference/mean": 0.06724433228373528,
"eval_steps_per_second": 0.001,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5424.0,
"completions/mean_length": 1000.6953125,
"completions/mean_terminated_length": 756.5159301757812,
"completions/min_length": 258.0,
"completions/min_terminated_length": 258.0,
"entropy": 0.11362230032682419,
"epoch": 1.0026315789473683,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.04116159304976463,
"learning_rate": 1e-06,
"loss": 0.0408,
"num_tokens": 135975406.0,
"reward": 0.8353489637374878,
"reward_std": 0.15797469019889832,
"rewards/progression_diversity/mean": -0.0002593405661173165,
"rewards/progression_diversity/std": 0.005868207197636366,
"rewards/symbolic_reward_accuracy/mean": 0.921875,
"rewards/symbolic_reward_accuracy/std": 0.26863065361976624,
"rewards/symbolic_reward_partial_score/mean": 0.9459635019302368,
"rewards/symbolic_reward_partial_score/std": 0.20843014121055603,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0185670852661133,
"sampling/importance_sampling_ratio/min": 5.377536217565648e-05,
"sampling/sampling_logp_difference/max": 9.830695152282715,
"sampling/sampling_logp_difference/mean": 0.060036540031433105,
"step": 381
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.11580048501491547,
"epoch": 1.0052631578947369,
"grad_norm": 0.039675887674093246,
"learning_rate": 1e-06,
"loss": 0.0341,
"step": 382
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.11673459783196449,
"epoch": 1.0078947368421052,
"grad_norm": 0.022269433364272118,
"learning_rate": 1e-06,
"loss": 0.0286,
"step": 383
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.10907693952322006,
"epoch": 1.0105263157894737,
"grad_norm": 0.01744413562119007,
"learning_rate": 1e-06,
"loss": 0.0408,
"step": 384
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7503.0,
"completions/mean_length": 1365.107421875,
"completions/mean_terminated_length": 817.8603515625,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"entropy": 0.11244688928127289,
"epoch": 1.013157894736842,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.04786130413413048,
"learning_rate": 1e-06,
"loss": 0.0923,
"num_tokens": 137089157.0,
"reward": 0.7905688285827637,
"reward_std": 0.1710084080696106,
"rewards/progression_diversity/mean": -0.0007375068962574005,
"rewards/progression_diversity/std": 0.00985936913639307,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.92431640625,
"rewards/symbolic_reward_partial_score/std": 0.22582882642745972,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0192362070083618,
"sampling/importance_sampling_ratio/min": 2.148686326108873e-05,
"sampling/sampling_logp_difference/max": 10.748068809509277,
"sampling/sampling_logp_difference/mean": 0.05868230015039444,
"step": 385
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.11678306013345718,
"epoch": 1.0157894736842106,
"grad_norm": 0.017300132662057877,
"learning_rate": 1e-06,
"loss": 0.0402,
"step": 386
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.11940395832061768,
"epoch": 1.018421052631579,
"grad_norm": 0.01427427213639021,
"learning_rate": 1e-06,
"loss": 0.0228,
"step": 387
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.11587749421596527,
"epoch": 1.0210526315789474,
"grad_norm": 0.03753054514527321,
"learning_rate": 1e-06,
"loss": 0.0614,
"step": 388
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10616.0,
"completions/mean_length": 1142.22265625,
"completions/mean_terminated_length": 745.1422729492188,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 0.1159932017326355,
"epoch": 1.0236842105263158,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.03958291932940483,
"learning_rate": 1e-06,
"loss": 0.0994,
"num_tokens": 138065431.0,
"reward": 0.8293313384056091,
"reward_std": 0.15930168330669403,
"rewards/progression_diversity/mean": -0.001442349050194025,
"rewards/progression_diversity/std": 0.016170360147953033,
"rewards/symbolic_reward_accuracy/mean": 0.916015625,
"rewards/symbolic_reward_accuracy/std": 0.2776356339454651,
"rewards/symbolic_reward_partial_score/mean": 0.94091796875,
"rewards/symbolic_reward_partial_score/std": 0.2196962684392929,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0209177732467651,
"sampling/importance_sampling_ratio/min": 8.514979299434344e-07,
"sampling/sampling_logp_difference/max": 13.976268768310547,
"sampling/sampling_logp_difference/mean": 0.06216352805495262,
"step": 389
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.12261833995580673,
"epoch": 1.0263157894736843,
"grad_norm": 0.025751272216439247,
"learning_rate": 1e-06,
"loss": 0.0612,
"step": 390
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.12619221955537796,
"epoch": 1.0289473684210526,
"grad_norm": 0.020393963903188705,
"learning_rate": 1e-06,
"loss": 0.0126,
"step": 391
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.1277061551809311,
"epoch": 1.0315789473684212,
"grad_norm": 0.02457948587834835,
"learning_rate": 1e-06,
"loss": 0.0283,
"step": 392
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6283.0,
"completions/mean_length": 1035.68359375,
"completions/mean_terminated_length": 761.0615844726562,
"completions/min_length": 292.0,
"completions/min_terminated_length": 292.0,
"entropy": 0.12381928041577339,
"epoch": 1.0342105263157895,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.0314381942152977,
"learning_rate": 1e-06,
"loss": 0.0236,
"num_tokens": 138999509.0,
"reward": 0.8199691772460938,
"reward_std": 0.13415905833244324,
"rewards/progression_diversity/mean": -0.00015412273933179677,
"rewards/progression_diversity/std": 0.0034873993135988712,
"rewards/symbolic_reward_accuracy/mean": 0.896484375,
"rewards/symbolic_reward_accuracy/std": 0.30492907762527466,
"rewards/symbolic_reward_partial_score/mean": 0.9461262822151184,
"rewards/symbolic_reward_partial_score/std": 0.19489480555057526,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0213356018066406,
"sampling/importance_sampling_ratio/min": 5.044097406425863e-07,
"sampling/sampling_logp_difference/max": 14.499876976013184,
"sampling/sampling_logp_difference/mean": 0.06386812776327133,
"step": 393
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.11941304057836533,
"epoch": 1.0368421052631578,
"grad_norm": 0.007009011693298817,
"learning_rate": 1e-06,
"loss": 0.0269,
"step": 394
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.1252107173204422,
"epoch": 1.0394736842105263,
"grad_norm": 0.018644485622644424,
"learning_rate": 1e-06,
"loss": 0.0083,
"step": 395
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.03125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.12022094428539276,
"epoch": 1.0421052631578946,
"grad_norm": 0.008900588378310204,
"learning_rate": 1e-06,
"loss": 0.0657,
"step": 396
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5305.0,
"completions/mean_length": 1517.716796875,
"completions/mean_terminated_length": 690.1093139648438,
"completions/min_length": 272.0,
"completions/min_terminated_length": 272.0,
"entropy": 0.11808853596448898,
"epoch": 1.0447368421052632,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.033308856189250946,
"learning_rate": 1e-06,
"loss": 0.0487,
"num_tokens": 140177540.0,
"reward": 0.805396318435669,
"reward_std": 0.14244824647903442,
"rewards/progression_diversity/mean": -0.002365510445088148,
"rewards/progression_diversity/std": 0.032508544623851776,
"rewards/symbolic_reward_accuracy/mean": 0.890625,
"rewards/symbolic_reward_accuracy/std": 0.31241437792778015,
"rewards/symbolic_reward_partial_score/mean": 0.9210612177848816,
"rewards/symbolic_reward_partial_score/std": 0.248762309551239,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0201090574264526,
"sampling/importance_sampling_ratio/min": 1.2532452728919452e-06,
"sampling/sampling_logp_difference/max": 13.589774131774902,
"sampling/sampling_logp_difference/mean": 0.05805087089538574,
"step": 397
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.11611740663647652,
"epoch": 1.0473684210526315,
"grad_norm": 0.013962333090603352,
"learning_rate": 1e-06,
"loss": 0.0597,
"step": 398
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.1225820817053318,
"epoch": 1.05,
"grad_norm": 0.023140504956245422,
"learning_rate": 1e-06,
"loss": 0.0526,
"step": 399
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.12173304334282875,
"epoch": 1.0526315789473684,
"grad_norm": 0.014247185550630093,
"learning_rate": 1e-06,
"loss": 0.0829,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9002.0,
"completions/mean_length": 1278.06640625,
"completions/mean_terminated_length": 664.0040283203125,
"completions/min_length": 259.0,
"completions/min_terminated_length": 259.0,
"entropy": 0.12073200196027756,
"epoch": 1.055263157894737,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.030312160030007362,
"learning_rate": 1e-06,
"loss": 0.0684,
"num_tokens": 141226214.0,
"reward": 0.8194763660430908,
"reward_std": 0.1385163962841034,
"rewards/progression_diversity/mean": -0.0006138992612250149,
"rewards/progression_diversity/std": 0.008086828514933586,
"rewards/symbolic_reward_accuracy/mean": 0.904296875,
"rewards/symbolic_reward_accuracy/std": 0.2944713830947876,
"rewards/symbolic_reward_partial_score/mean": 0.93603515625,
"rewards/symbolic_reward_partial_score/std": 0.22104382514953613,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0207290649414062,
"sampling/importance_sampling_ratio/min": 0.00014015565102454275,
"sampling/sampling_logp_difference/max": 8.872756958007812,
"sampling/sampling_logp_difference/mean": 0.06024109199643135,
"step": 401
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.1248621977865696,
"epoch": 1.0578947368421052,
"grad_norm": 0.010959668084979057,
"learning_rate": 1e-06,
"loss": 0.0659,
"step": 402
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.11883337050676346,
"epoch": 1.0605263157894738,
"grad_norm": 0.024657705798745155,
"learning_rate": 1e-06,
"loss": 0.0655,
"step": 403
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.11919617652893066,
"epoch": 1.063157894736842,
"grad_norm": 0.01765652373433113,
"learning_rate": 1e-06,
"loss": 0.0271,
"step": 404
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.041015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7545.0,
"completions/mean_length": 1404.330078125,
"completions/mean_terminated_length": 763.6517944335938,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"entropy": 0.11879817768931389,
"epoch": 1.0657894736842106,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.012826770544052124,
"learning_rate": 1e-06,
"loss": 0.0739,
"num_tokens": 142359887.0,
"reward": 0.8211853504180908,
"reward_std": 0.14381209015846252,
"rewards/progression_diversity/mean": -0.000610048184171319,
"rewards/progression_diversity/std": 0.00695717241615057,
"rewards/symbolic_reward_accuracy/mean": 0.908203125,
"rewards/symbolic_reward_accuracy/std": 0.289021372795105,
"rewards/symbolic_reward_partial_score/mean": 0.9345703125,
"rewards/symbolic_reward_partial_score/std": 0.22934241592884064,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0197585821151733,
"sampling/importance_sampling_ratio/min": 4.271742568562331e-07,
"sampling/sampling_logp_difference/max": 14.6660737991333,
"sampling/sampling_logp_difference/mean": 0.058111920952796936,
"step": 405
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.112503781914711,
"epoch": 1.068421052631579,
"grad_norm": 0.022235997021198273,
"learning_rate": 1e-06,
"loss": 0.0677,
"step": 406
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.11570753157138824,
"epoch": 1.0710526315789473,
"grad_norm": 0.01149500161409378,
"learning_rate": 1e-06,
"loss": 0.0595,
"step": 407
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.11613816022872925,
"epoch": 1.0736842105263158,
"grad_norm": 0.015948958694934845,
"learning_rate": 1e-06,
"loss": 0.0366,
"step": 408
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4484.0,
"completions/mean_length": 1437.271484375,
"completions/mean_terminated_length": 734.255615234375,
"completions/min_length": 238.0,
"completions/min_terminated_length": 238.0,
"entropy": 0.11242419108748436,
"epoch": 1.0763157894736841,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.037394747138023376,
"learning_rate": 1e-06,
"loss": 0.0298,
"num_tokens": 143501082.0,
"reward": 0.8034499883651733,
"reward_std": 0.14993996918201447,
"rewards/progression_diversity/mean": -0.0016794700641185045,
"rewards/progression_diversity/std": 0.01779768615961075,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9236653447151184,
"rewards/symbolic_reward_partial_score/std": 0.24423635005950928,
"rewards/tag_count_reward/mean": -0.044921875,
"rewards/tag_count_reward/std": 0.20733514428138733,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0196541547775269,
"sampling/importance_sampling_ratio/min": 5.551847425522283e-05,
"sampling/sampling_logp_difference/max": 9.798794746398926,
"sampling/sampling_logp_difference/mean": 0.05961894989013672,
"step": 409
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.10686653479933739,
"epoch": 1.0789473684210527,
"grad_norm": 0.03459122031927109,
"learning_rate": 1e-06,
"loss": 0.1115,
"step": 410
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0546875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.10253704711794853,
"epoch": 1.081578947368421,
"grad_norm": 0.017048373818397522,
"learning_rate": 1e-06,
"loss": 0.0259,
"step": 411
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0546875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.109375,
"entropy": 0.0990813598036766,
"epoch": 1.0842105263157895,
"grad_norm": 0.03026195615530014,
"learning_rate": 1e-06,
"loss": 0.0124,
"step": 412
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4737.0,
"completions/mean_length": 1033.18359375,
"completions/mean_terminated_length": 664.7640380859375,
"completions/min_length": 269.0,
"completions/min_terminated_length": 269.0,
"entropy": 0.10128960758447647,
"epoch": 1.0868421052631578,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.029587438330054283,
"learning_rate": 1e-06,
"loss": 0.0237,
"num_tokens": 144440376.0,
"reward": 0.8033533096313477,
"reward_std": 0.17567187547683716,
"rewards/progression_diversity/mean": -0.0015859488630667329,
"rewards/progression_diversity/std": 0.02551618218421936,
"rewards/symbolic_reward_accuracy/mean": 0.876953125,
"rewards/symbolic_reward_accuracy/std": 0.32881227135658264,
"rewards/symbolic_reward_partial_score/mean": 0.9318033456802368,
"rewards/symbolic_reward_partial_score/std": 0.2252691388130188,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0161960124969482,
"sampling/importance_sampling_ratio/min": 1.3736066648561973e-06,
"sampling/sampling_logp_difference/max": 13.49807071685791,
"sampling/sampling_logp_difference/mean": 0.05791211128234863,
"step": 413
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.09996145218610764,
"epoch": 1.0894736842105264,
"grad_norm": 0.013834556564688683,
"learning_rate": 1e-06,
"loss": 0.0171,
"step": 414
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.10281230881810188,
"epoch": 1.0921052631578947,
"grad_norm": 0.02543732523918152,
"learning_rate": 1e-06,
"loss": 0.0327,
"step": 415
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.09636327251791954,
"epoch": 1.0947368421052632,
"grad_norm": 0.008221838623285294,
"learning_rate": 1e-06,
"loss": 0.0774,
"step": 416
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5506.0,
"completions/mean_length": 788.818359375,
"completions/mean_terminated_length": 666.0216674804688,
"completions/min_length": 252.0,
"completions/min_terminated_length": 252.0,
"entropy": 0.1014927513897419,
"epoch": 1.0973684210526315,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.01914730854332447,
"learning_rate": 1e-06,
"loss": 0.0277,
"num_tokens": 145242203.0,
"reward": 0.8517071008682251,
"reward_std": 0.12221311032772064,
"rewards/progression_diversity/mean": -0.00019061629427596927,
"rewards/progression_diversity/std": 0.004313154611736536,
"rewards/symbolic_reward_accuracy/mean": 0.9375,
"rewards/symbolic_reward_accuracy/std": 0.2422981858253479,
"rewards/symbolic_reward_partial_score/mean": 0.9666340947151184,
"rewards/symbolic_reward_partial_score/std": 0.15885646641254425,
"rewards/tag_count_reward/mean": -0.0078125,
"rewards/tag_count_reward/std": 0.08812850713729858,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.017521858215332,
"sampling/importance_sampling_ratio/min": 4.664206699089846e-06,
"sampling/sampling_logp_difference/max": 12.275592803955078,
"sampling/sampling_logp_difference/mean": 0.060040805488824844,
"step": 417
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.10259632021188736,
"epoch": 1.1,
"grad_norm": 0.03230629488825798,
"learning_rate": 1e-06,
"loss": 0.0411,
"step": 418
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.10374432802200317,
"epoch": 1.1026315789473684,
"grad_norm": 0.029180046170949936,
"learning_rate": 1e-06,
"loss": 0.0381,
"step": 419
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.10562139376997948,
"epoch": 1.1052631578947367,
"grad_norm": 0.005970039404928684,
"learning_rate": 1e-06,
"loss": 0.0082,
"step": 420
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7054.0,
"completions/mean_length": 1033.38671875,
"completions/mean_terminated_length": 727.59765625,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.10337032377719879,
"epoch": 1.1078947368421053,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.016906937584280968,
"learning_rate": 1e-06,
"loss": 0.014,
"num_tokens": 146154753.0,
"reward": 0.8127256631851196,
"reward_std": 0.16144107282161713,
"rewards/progression_diversity/mean": -0.0018486212939023972,
"rewards/progression_diversity/std": 0.030813097953796387,
"rewards/symbolic_reward_accuracy/mean": 0.896484375,
"rewards/symbolic_reward_accuracy/std": 0.30492907762527466,
"rewards/symbolic_reward_partial_score/mean": 0.9226887822151184,
"rewards/symbolic_reward_partial_score/std": 0.25257766246795654,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0161592960357666,
"sampling/importance_sampling_ratio/min": 5.162914021639153e-06,
"sampling/sampling_logp_difference/max": 12.174009323120117,
"sampling/sampling_logp_difference/mean": 0.05756688863039017,
"step": 421
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.09895607084035873,
"epoch": 1.1105263157894736,
"grad_norm": 0.04962952807545662,
"learning_rate": 1e-06,
"loss": 0.0424,
"step": 422
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.10057874768972397,
"epoch": 1.1131578947368421,
"grad_norm": 0.015979913994669914,
"learning_rate": 1e-06,
"loss": 0.0597,
"step": 423
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.10110774263739586,
"epoch": 1.1157894736842104,
"grad_norm": 0.0167376846075058,
"learning_rate": 1e-06,
"loss": 0.0414,
"step": 424
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5384.0,
"completions/mean_length": 802.5234375,
"completions/mean_terminated_length": 741.419677734375,
"completions/min_length": 278.0,
"completions/min_terminated_length": 278.0,
"entropy": 0.09795539081096649,
"epoch": 1.118421052631579,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.01597507670521736,
"learning_rate": 1e-06,
"loss": 0.0091,
"num_tokens": 146970797.0,
"reward": 0.8327491879463196,
"reward_std": 0.15943855047225952,
"rewards/progression_diversity/mean": -0.0014538828982040286,
"rewards/progression_diversity/std": 0.03289761394262314,
"rewards/symbolic_reward_accuracy/mean": 0.912109375,
"rewards/symbolic_reward_accuracy/std": 0.2834126651287079,
"rewards/symbolic_reward_partial_score/mean": 0.9529622197151184,
"rewards/symbolic_reward_partial_score/std": 0.18036773800849915,
"rewards/tag_count_reward/mean": -0.00390625,
"rewards/tag_count_reward/std": 0.06243881583213806,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0156102180480957,
"sampling/importance_sampling_ratio/min": 2.9369621188379824e-05,
"sampling/sampling_logp_difference/max": 10.43554973602295,
"sampling/sampling_logp_difference/mean": 0.06037828326225281,
"step": 425
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.09897046536207199,
"epoch": 1.1210526315789473,
"grad_norm": 0.034385696053504944,
"learning_rate": 1e-06,
"loss": 0.0032,
"step": 426
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.09649962186813354,
"epoch": 1.1236842105263158,
"grad_norm": 0.01485302671790123,
"learning_rate": 1e-06,
"loss": 0.0068,
"step": 427
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.09620106220245361,
"epoch": 1.1263157894736842,
"grad_norm": 0.019307559356093407,
"learning_rate": 1e-06,
"loss": 0.0451,
"step": 428
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7232.0,
"completions/mean_length": 914.90234375,
"completions/mean_terminated_length": 731.474365234375,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.09814492240548134,
"epoch": 1.1289473684210527,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.021051058545708656,
"learning_rate": 1e-06,
"loss": 0.0122,
"num_tokens": 147822843.0,
"reward": 0.8289062976837158,
"reward_std": 0.11690939962863922,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9466145634651184,
"rewards/symbolic_reward_partial_score/std": 0.19831162691116333,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0156972408294678,
"sampling/importance_sampling_ratio/min": 4.255066414771136e-06,
"sampling/sampling_logp_difference/max": 12.367400169372559,
"sampling/sampling_logp_difference/mean": 0.06155911460518837,
"step": 429
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.10168719664216042,
"epoch": 1.131578947368421,
"grad_norm": 0.02653714269399643,
"learning_rate": 1e-06,
"loss": 0.0208,
"step": 430
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1171875,
"entropy": 0.09842254221439362,
"epoch": 1.1342105263157896,
"grad_norm": 0.010419169440865517,
"learning_rate": 1e-06,
"loss": 0.0184,
"step": 431
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.09445172548294067,
"epoch": 1.1368421052631579,
"grad_norm": 0.016496408730745316,
"learning_rate": 1e-06,
"loss": 0.0163,
"step": 432
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4974.0,
"completions/mean_length": 910.11328125,
"completions/mean_terminated_length": 695.623779296875,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 0.09750872850418091,
"epoch": 1.1394736842105262,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.01833958737552166,
"learning_rate": 1e-06,
"loss": 0.0055,
"num_tokens": 148674101.0,
"reward": 0.8388184309005737,
"reward_std": 0.0944415032863617,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.919921875,
"rewards/symbolic_reward_accuracy/std": 0.271679550409317,
"rewards/symbolic_reward_partial_score/mean": 0.9607747197151184,
"rewards/symbolic_reward_partial_score/std": 0.16794845461845398,
"rewards/tag_count_reward/mean": -0.013671875,
"rewards/tag_count_reward/std": 0.1162383034825325,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0153504610061646,
"sampling/importance_sampling_ratio/min": 1.6851483541913126e-09,
"sampling/sampling_logp_difference/max": 20.201412200927734,
"sampling/sampling_logp_difference/mean": 0.05861156806349754,
"step": 433
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.09329849109053612,
"epoch": 1.1421052631578947,
"grad_norm": 0.045422203838825226,
"learning_rate": 1e-06,
"loss": 0.0453,
"step": 434
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.09762432053685188,
"epoch": 1.1447368421052633,
"grad_norm": 0.014451306313276291,
"learning_rate": 1e-06,
"loss": 0.0188,
"step": 435
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.09433042258024216,
"epoch": 1.1473684210526316,
"grad_norm": 0.01592213101685047,
"learning_rate": 1e-06,
"loss": 0.0077,
"step": 436
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5906.0,
"completions/mean_length": 745.5859375,
"completions/mean_terminated_length": 684.2588500976562,
"completions/min_length": 275.0,
"completions/min_terminated_length": 275.0,
"entropy": 0.09743008762598038,
"epoch": 1.15,
"frac_reward_zero_std": 0.65625,
"grad_norm": 0.027467206120491028,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 149445473.0,
"reward": 0.8571289777755737,
"reward_std": 0.09262826293706894,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.9453125,
"rewards/symbolic_reward_accuracy/std": 0.2275916188955307,
"rewards/symbolic_reward_partial_score/mean": 0.9677734375,
"rewards/symbolic_reward_partial_score/std": 0.1515229344367981,
"rewards/tag_count_reward/mean": -0.00390625,
"rewards/tag_count_reward/std": 0.06243881583213806,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0158276557922363,
"sampling/importance_sampling_ratio/min": 4.494921683228018e-16,
"sampling/sampling_logp_difference/max": 35.33841323852539,
"sampling/sampling_logp_difference/mean": 0.06052025035023689,
"step": 437
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.09928737580776215,
"epoch": 1.1526315789473685,
"grad_norm": 0.026970867067575455,
"learning_rate": 1e-06,
"loss": 0.0147,
"step": 438
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1015625,
"entropy": 0.09795548766851425,
"epoch": 1.1552631578947368,
"grad_norm": 0.014579696580767632,
"learning_rate": 1e-06,
"loss": 0.0073,
"step": 439
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.10043598338961601,
"epoch": 1.1578947368421053,
"grad_norm": 0.00749570457264781,
"learning_rate": 1e-06,
"loss": 0.019,
"step": 440
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5394.0,
"completions/mean_length": 1239.98828125,
"completions/mean_terminated_length": 845.4548950195312,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.0946054458618164,
"epoch": 1.1605263157894736,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.030099380761384964,
"learning_rate": 1e-06,
"loss": 0.0194,
"num_tokens": 150507195.0,
"reward": 0.7865234613418579,
"reward_std": 0.19104072451591492,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.85546875,
"rewards/symbolic_reward_accuracy/std": 0.35197147727012634,
"rewards/symbolic_reward_partial_score/mean": 0.9186197519302368,
"rewards/symbolic_reward_partial_score/std": 0.2408839911222458,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0148797035217285,
"sampling/importance_sampling_ratio/min": 3.064930638174701e-07,
"sampling/sampling_logp_difference/max": 14.99807071685791,
"sampling/sampling_logp_difference/mean": 0.05635258927941322,
"step": 441
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.0940646082162857,
"epoch": 1.1631578947368422,
"grad_norm": 0.03301357850432396,
"learning_rate": 1e-06,
"loss": 0.0389,
"step": 442
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.09547659382224083,
"epoch": 1.1657894736842105,
"grad_norm": 0.023627353832125664,
"learning_rate": 1e-06,
"loss": 0.0095,
"step": 443
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.0967930480837822,
"epoch": 1.168421052631579,
"grad_norm": 0.028321361169219017,
"learning_rate": 1e-06,
"loss": 0.0502,
"step": 444
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9650.0,
"completions/mean_length": 1090.6171875,
"completions/mean_terminated_length": 878.6297607421875,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"entropy": 0.10013857111334801,
"epoch": 1.1710526315789473,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.04974789917469025,
"learning_rate": 1e-06,
"loss": 0.0319,
"num_tokens": 151460055.0,
"reward": 0.800537109375,
"reward_std": 0.16888116300106049,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.873046875,
"rewards/symbolic_reward_accuracy/std": 0.33324605226516724,
"rewards/symbolic_reward_partial_score/mean": 0.92626953125,
"rewards/symbolic_reward_partial_score/std": 0.22773192822933197,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.016984224319458,
"sampling/importance_sampling_ratio/min": 2.9851989324924944e-07,
"sampling/sampling_logp_difference/max": 15.024429321289062,
"sampling/sampling_logp_difference/mean": 0.059770140796899796,
"step": 445
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.09959771484136581,
"epoch": 1.1736842105263159,
"grad_norm": 0.017701053991913795,
"learning_rate": 1e-06,
"loss": 0.0206,
"step": 446
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.10232729837298393,
"epoch": 1.1763157894736842,
"grad_norm": 0.02004496566951275,
"learning_rate": 1e-06,
"loss": 0.0277,
"step": 447
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.10072638839483261,
"epoch": 1.1789473684210527,
"grad_norm": 0.018607337027788162,
"learning_rate": 1e-06,
"loss": 0.031,
"step": 448
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6300.0,
"completions/mean_length": 931.9296875,
"completions/mean_terminated_length": 840.8566284179688,
"completions/min_length": 289.0,
"completions/min_terminated_length": 289.0,
"entropy": 0.10329998284578323,
"epoch": 1.181578947368421,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.039218734949827194,
"learning_rate": 1e-06,
"loss": 0.0256,
"num_tokens": 152344499.0,
"reward": 0.8318839073181152,
"reward_std": 0.14675593376159668,
"rewards/progression_diversity/mean": -8.955165685620159e-05,
"rewards/progression_diversity/std": 0.002026322763413191,
"rewards/symbolic_reward_accuracy/mean": 0.916015625,
"rewards/symbolic_reward_accuracy/std": 0.2776356339454651,
"rewards/symbolic_reward_partial_score/mean": 0.94287109375,
"rewards/symbolic_reward_partial_score/std": 0.2136351764202118,
"rewards/tag_count_reward/mean": -0.005859375,
"rewards/tag_count_reward/std": 0.07639661431312561,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0176771879196167,
"sampling/importance_sampling_ratio/min": 6.2567501117882784e-06,
"sampling/sampling_logp_difference/max": 11.981849670410156,
"sampling/sampling_logp_difference/mean": 0.06110313534736633,
"step": 449
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.10243717581033707,
"epoch": 1.1842105263157894,
"grad_norm": 0.02117745950818062,
"learning_rate": 1e-06,
"loss": 0.021,
"step": 450
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.10411274060606956,
"epoch": 1.186842105263158,
"grad_norm": 0.02108968421816826,
"learning_rate": 1e-06,
"loss": 0.0114,
"step": 451
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.10935872420668602,
"epoch": 1.1894736842105262,
"grad_norm": 0.007527265697717667,
"learning_rate": 1e-06,
"loss": 0.0099,
"step": 452
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7876.0,
"completions/max_terminated_length": 7876.0,
"completions/mean_length": 748.26171875,
"completions/mean_terminated_length": 748.26171875,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"entropy": 0.10911908373236656,
"epoch": 1.1921052631578948,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.029693739488720894,
"learning_rate": 1e-06,
"loss": 0.004,
"num_tokens": 153135929.0,
"reward": 0.8560059070587158,
"reward_std": 0.12228970229625702,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.943359375,
"rewards/symbolic_reward_accuracy/std": 0.23138070106506348,
"rewards/symbolic_reward_partial_score/mean": 0.9666340947151184,
"rewards/symbolic_reward_partial_score/std": 0.15885646641254425,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0188539028167725,
"sampling/importance_sampling_ratio/min": 1.5778656234033406e-05,
"sampling/sampling_logp_difference/max": 11.056852340698242,
"sampling/sampling_logp_difference/mean": 0.06543545424938202,
"step": 453
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.11097723618149757,
"epoch": 1.194736842105263,
"grad_norm": 0.019065558910369873,
"learning_rate": 1e-06,
"loss": -0.0018,
"step": 454
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.11221178621053696,
"epoch": 1.1973684210526316,
"grad_norm": 0.019534561783075333,
"learning_rate": 1e-06,
"loss": -0.0013,
"step": 455
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.10779605805873871,
"epoch": 1.2,
"grad_norm": 0.009235396981239319,
"learning_rate": 1e-06,
"loss": 0.0147,
"step": 456
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6720.0,
"completions/mean_length": 898.009765625,
"completions/mean_terminated_length": 745.2879638671875,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"entropy": 0.11065573990345001,
"epoch": 1.2026315789473685,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.012675322592258453,
"learning_rate": 1e-06,
"loss": 0.0092,
"num_tokens": 154012350.0,
"reward": 0.8206509947776794,
"reward_std": 0.14962440729141235,
"rewards/progression_diversity/mean": -0.00033113209065049887,
"rewards/progression_diversity/std": 0.0074926638044416904,
"rewards/symbolic_reward_accuracy/mean": 0.896484375,
"rewards/symbolic_reward_accuracy/std": 0.30492907762527466,
"rewards/symbolic_reward_partial_score/mean": 0.94580078125,
"rewards/symbolic_reward_partial_score/std": 0.1969550997018814,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0194942951202393,
"sampling/importance_sampling_ratio/min": 1.5589863266995962e-07,
"sampling/sampling_logp_difference/max": 15.674059867858887,
"sampling/sampling_logp_difference/mean": 0.061966508626937866,
"step": 457
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.10747477412223816,
"epoch": 1.2052631578947368,
"grad_norm": 0.023037495091557503,
"learning_rate": 1e-06,
"loss": 0.0034,
"step": 458
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.10901957005262375,
"epoch": 1.2078947368421054,
"grad_norm": 0.032999638468027115,
"learning_rate": 1e-06,
"loss": 0.039,
"step": 459
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.1066613681614399,
"epoch": 1.2105263157894737,
"grad_norm": 0.0317135825753212,
"learning_rate": 1e-06,
"loss": 0.0266,
"step": 460
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11462.0,
"completions/mean_length": 1336.01953125,
"completions/mean_terminated_length": 912.98388671875,
"completions/min_length": 248.0,
"completions/min_terminated_length": 248.0,
"entropy": 0.11170916259288788,
"epoch": 1.2131578947368422,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.027878640219569206,
"learning_rate": 1e-06,
"loss": 0.0273,
"num_tokens": 155114376.0,
"reward": 0.7967281341552734,
"reward_std": 0.12850847840309143,
"rewards/progression_diversity/mean": -4.386279761092737e-05,
"rewards/progression_diversity/std": 0.0009925017366185784,
"rewards/symbolic_reward_accuracy/mean": 0.875,
"rewards/symbolic_reward_accuracy/std": 0.3310423493385315,
"rewards/symbolic_reward_partial_score/mean": 0.9148762822151184,
"rewards/symbolic_reward_partial_score/std": 0.25462573766708374,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0198333263397217,
"sampling/importance_sampling_ratio/min": 3.876455139106838e-06,
"sampling/sampling_logp_difference/max": 12.460589408874512,
"sampling/sampling_logp_difference/mean": 0.06402070820331573,
"step": 461
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.11339271068572998,
"epoch": 1.2157894736842105,
"grad_norm": 0.02045866660773754,
"learning_rate": 1e-06,
"loss": 0.0186,
"step": 462
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.11221589148044586,
"epoch": 1.2184210526315788,
"grad_norm": 0.011421293020248413,
"learning_rate": 1e-06,
"loss": 0.0203,
"step": 463
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.11195248365402222,
"epoch": 1.2210526315789474,
"grad_norm": 0.016839897260069847,
"learning_rate": 1e-06,
"loss": 0.0322,
"step": 464
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7398.0,
"completions/mean_length": 822.09375,
"completions/mean_terminated_length": 761.0667114257812,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.11619088798761368,
"epoch": 1.2236842105263157,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.045058924704790115,
"learning_rate": 1e-06,
"loss": 0.0145,
"num_tokens": 155939256.0,
"reward": 0.8314453363418579,
"reward_std": 0.14568641781806946,
"rewards/progression_diversity/mean": 0.0,
"rewards/progression_diversity/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9524739384651184,
"rewards/symbolic_reward_partial_score/std": 0.1864290088415146,
"rewards/tag_count_reward/mean": -0.00390625,
"rewards/tag_count_reward/std": 0.06243881583213806,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0201449394226074,
"sampling/importance_sampling_ratio/min": 7.38695504765019e-08,
"sampling/sampling_logp_difference/max": 16.42096519470215,
"sampling/sampling_logp_difference/mean": 0.06559212505817413,
"step": 465
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.11323034018278122,
"epoch": 1.2263157894736842,
"grad_norm": 0.018060827627778053,
"learning_rate": 1e-06,
"loss": 0.0557,
"step": 466
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.11986911296844482,
"epoch": 1.2289473684210526,
"grad_norm": 0.006362342741340399,
"learning_rate": 1e-06,
"loss": 0.0211,
"step": 467
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.12106319144368172,
"epoch": 1.231578947368421,
"grad_norm": 0.0218115895986557,
"learning_rate": 1e-06,
"loss": 0.0058,
"step": 468
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12412.0,
"completions/mean_length": 1080.7890625,
"completions/mean_terminated_length": 899.328125,
"completions/min_length": 309.0,
"completions/min_terminated_length": 309.0,
"entropy": 0.12077518552541733,
"epoch": 1.2342105263157894,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.023888949304819107,
"learning_rate": 1e-06,
"loss": 0.0142,
"num_tokens": 156925804.0,
"reward": 0.8068230152130127,
"reward_std": 0.18419486284255981,
"rewards/progression_diversity/mean": -0.0012958223232999444,
"rewards/progression_diversity/std": 0.029321111738681793,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9231771230697632,
"rewards/symbolic_reward_partial_score/std": 0.24108576774597168,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0196768045425415,
"sampling/importance_sampling_ratio/min": 8.88462352577335e-08,
"sampling/sampling_logp_difference/max": 16.236358642578125,
"sampling/sampling_logp_difference/mean": 0.0635080635547638,
"step": 469
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.11772556602954865,
"epoch": 1.236842105263158,
"grad_norm": 0.019387010484933853,
"learning_rate": 1e-06,
"loss": 0.0357,
"step": 470
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.11631882935762405,
"epoch": 1.2394736842105263,
"grad_norm": 0.02116510644555092,
"learning_rate": 1e-06,
"loss": 0.0205,
"step": 471
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.11524809151887894,
"epoch": 1.2421052631578948,
"grad_norm": 0.017463896423578262,
"learning_rate": 1e-06,
"loss": 0.0445,
"step": 472
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7361.0,
"completions/mean_length": 927.275390625,
"completions/mean_terminated_length": 805.5689086914062,
"completions/min_length": 269.0,
"completions/min_terminated_length": 269.0,
"entropy": 0.12217172235250473,
"epoch": 1.2447368421052631,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.030732987448573112,
"learning_rate": 1e-06,
"loss": 0.0108,
"num_tokens": 157807385.0,
"reward": 0.8150861263275146,
"reward_std": 0.15566140413284302,
"rewards/progression_diversity/mean": -0.0001790223177522421,
"rewards/progression_diversity/std": 0.004050812683999538,
"rewards/symbolic_reward_accuracy/mean": 0.888671875,
"rewards/symbolic_reward_accuracy/std": 0.31484565138816833,
"rewards/symbolic_reward_partial_score/mean": 0.9422200322151184,
"rewards/symbolic_reward_partial_score/std": 0.19841378927230835,
"rewards/tag_count_reward/mean": -0.0078125,
"rewards/tag_count_reward/std": 0.08812850713729858,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0209197998046875,
"sampling/importance_sampling_ratio/min": 4.93685820401879e-06,
"sampling/sampling_logp_difference/max": 12.218781471252441,
"sampling/sampling_logp_difference/mean": 0.06566841155290604,
"step": 473
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.12189845740795135,
"epoch": 1.2473684210526317,
"grad_norm": 0.023981690406799316,
"learning_rate": 1e-06,
"loss": 0.03,
"step": 474
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.1238529235124588,
"epoch": 1.25,
"grad_norm": 0.02387721836566925,
"learning_rate": 1e-06,
"loss": 0.0418,
"step": 475
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.12588384374976158,
"epoch": 1.2526315789473683,
"grad_norm": 0.021237516775727272,
"learning_rate": 1e-06,
"loss": 0.007,
"step": 476
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9261.0,
"completions/mean_length": 899.77734375,
"completions/mean_terminated_length": 716.1699829101562,
"completions/min_length": 251.0,
"completions/min_terminated_length": 251.0,
"entropy": 0.13065219670534134,
"epoch": 1.2552631578947369,
"frac_reward_zero_std": 0.65625,
"grad_norm": 0.0358455665409565,
"learning_rate": 1e-06,
"loss": 0.0222,
"num_tokens": 158659847.0,
"reward": 0.8351035714149475,
"reward_std": 0.10815407335758209,
"rewards/progression_diversity/mean": -0.0003900247684214264,
"rewards/progression_diversity/std": 0.008825253695249557,
"rewards/symbolic_reward_accuracy/mean": 0.91796875,
"rewards/symbolic_reward_accuracy/std": 0.2746807038784027,
"rewards/symbolic_reward_partial_score/mean": 0.9510090947151184,
"rewards/symbolic_reward_partial_score/std": 0.1945120096206665,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0229235887527466,
"sampling/importance_sampling_ratio/min": 4.0339618863072246e-05,
"sampling/sampling_logp_difference/max": 10.118176460266113,
"sampling/sampling_logp_difference/mean": 0.06817199289798737,
"step": 477
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.13009684532880783,
"epoch": 1.2578947368421054,
"grad_norm": 0.017281439155340195,
"learning_rate": 1e-06,
"loss": 0.0275,
"step": 478
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1171875,
"entropy": 0.12655635178089142,
"epoch": 1.2605263157894737,
"grad_norm": 0.011967520229518414,
"learning_rate": 1e-06,
"loss": 0.0433,
"step": 479
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0390625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.109375,
"entropy": 0.13125859946012497,
"epoch": 1.263157894736842,
"grad_norm": 0.010364379733800888,
"learning_rate": 1e-06,
"loss": 0.0029,
"step": 480
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11268.0,
"completions/mean_length": 1359.4765625,
"completions/mean_terminated_length": 780.4381103515625,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 0.123091921210289,
"epoch": 1.2657894736842106,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.03727136552333832,
"learning_rate": 1e-06,
"loss": 0.0505,
"num_tokens": 159759867.0,
"reward": 0.8022311925888062,
"reward_std": 0.1121816635131836,
"rewards/progression_diversity/mean": -0.001497793011367321,
"rewards/progression_diversity/std": 0.01563715748488903,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9208984375,
"rewards/symbolic_reward_partial_score/std": 0.2492290735244751,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0214684009552002,
"sampling/importance_sampling_ratio/min": 1.330551033934535e-11,
"sampling/sampling_logp_difference/max": 25.042842864990234,
"sampling/sampling_logp_difference/mean": 0.06472167372703552,
"step": 481
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.12072613090276718,
"epoch": 1.268421052631579,
"grad_norm": 0.022708555683493614,
"learning_rate": 1e-06,
"loss": 0.0388,
"step": 482
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.12625328078866005,
"epoch": 1.2710526315789474,
"grad_norm": 0.021841494366526604,
"learning_rate": 1e-06,
"loss": 0.0393,
"step": 483
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.12154950946569443,
"epoch": 1.2736842105263158,
"grad_norm": 0.009138455614447594,
"learning_rate": 1e-06,
"loss": 0.0245,
"step": 484
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4483.0,
"completions/mean_length": 1131.763671875,
"completions/mean_terminated_length": 671.4345703125,
"completions/min_length": 208.0,
"completions/min_terminated_length": 208.0,
"entropy": 0.12409983202815056,
"epoch": 1.2763157894736843,
"frac_reward_zero_std": 0.71875,
"grad_norm": 0.025687798857688904,
"learning_rate": 1e-06,
"loss": 0.0118,
"num_tokens": 160721602.0,
"reward": 0.8422271013259888,
"reward_std": 0.07236142456531525,
"rewards/progression_diversity/mean": -0.0009313340415246785,
"rewards/progression_diversity/std": 0.01315159909427166,
"rewards/symbolic_reward_accuracy/mean": 0.931640625,
"rewards/symbolic_reward_accuracy/std": 0.25260838866233826,
"rewards/symbolic_reward_partial_score/mean": 0.9539387822151184,
"rewards/symbolic_reward_partial_score/std": 0.19256454706192017,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0208799839019775,
"sampling/importance_sampling_ratio/min": 0.000312736548949033,
"sampling/sampling_logp_difference/max": 8.070149421691895,
"sampling/sampling_logp_difference/mean": 0.06437121331691742,
"step": 485
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.078125,
"entropy": 0.11737996339797974,
"epoch": 1.2789473684210526,
"grad_norm": 0.024496793746948242,
"learning_rate": 1e-06,
"loss": 0.0558,
"step": 486
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.0625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.078125,
"entropy": 0.11956480145454407,
"epoch": 1.2815789473684212,
"grad_norm": 0.010835636407136917,
"learning_rate": 1e-06,
"loss": 0.0307,
"step": 487
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1171875,
"entropy": 0.1189895048737526,
"epoch": 1.2842105263157895,
"grad_norm": 0.011718123219907284,
"learning_rate": 1e-06,
"loss": 0.0353,
"step": 488
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5296.0,
"completions/mean_length": 1337.478515625,
"completions/mean_terminated_length": 789.2247314453125,
"completions/min_length": 261.0,
"completions/min_terminated_length": 261.0,
"entropy": 0.11463934555649757,
"epoch": 1.2868421052631578,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.04130804166197777,
"learning_rate": 1e-06,
"loss": 0.0829,
"num_tokens": 161806871.0,
"reward": 0.7914862632751465,
"reward_std": 0.1913241147994995,
"rewards/progression_diversity/mean": -0.0017659981967881322,
"rewards/progression_diversity/std": 0.017526477575302124,
"rewards/symbolic_reward_accuracy/mean": 0.8671875,
"rewards/symbolic_reward_accuracy/std": 0.33970388770103455,
"rewards/symbolic_reward_partial_score/mean": 0.9156900644302368,
"rewards/symbolic_reward_partial_score/std": 0.2554028630256653,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.018146276473999,
"sampling/importance_sampling_ratio/min": 1.5589847635055776e-07,
"sampling/sampling_logp_difference/max": 15.674060821533203,
"sampling/sampling_logp_difference/mean": 0.06120399013161659,
"step": 489
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.11538948118686676,
"epoch": 1.2894736842105263,
"grad_norm": 0.013879453763365746,
"learning_rate": 1e-06,
"loss": 0.0104,
"step": 490
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.11806956678628922,
"epoch": 1.2921052631578949,
"grad_norm": 0.012860557064414024,
"learning_rate": 1e-06,
"loss": 0.0499,
"step": 491
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.1125846691429615,
"epoch": 1.2947368421052632,
"grad_norm": 0.02000536397099495,
"learning_rate": 1e-06,
"loss": 0.0199,
"step": 492
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8706.0,
"completions/mean_length": 924.626953125,
"completions/mean_terminated_length": 741.3142700195312,
"completions/min_length": 243.0,
"completions/min_terminated_length": 243.0,
"entropy": 0.1223670057952404,
"epoch": 1.2973684210526315,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.014644470997154713,
"learning_rate": 1e-06,
"loss": 0.0171,
"num_tokens": 162687256.0,
"reward": 0.8528740406036377,
"reward_std": 0.12011945247650146,
"rewards/progression_diversity/mean": -0.000687220657709986,
"rewards/progression_diversity/std": 0.013531150296330452,
"rewards/symbolic_reward_accuracy/mean": 0.94140625,
"rewards/symbolic_reward_accuracy/std": 0.23509246110916138,
"rewards/symbolic_reward_partial_score/mean": 0.96337890625,
"rewards/symbolic_reward_partial_score/std": 0.17117686569690704,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0194642543792725,
"sampling/importance_sampling_ratio/min": 2.044983940790368e-21,
"sampling/sampling_logp_difference/max": 47.63889694213867,
"sampling/sampling_logp_difference/mean": 0.06407853960990906,
"step": 493
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.1194252297282219,
"epoch": 1.3,
"grad_norm": 0.00824655033648014,
"learning_rate": 1e-06,
"loss": 0.033,
"step": 494
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.11969351023435593,
"epoch": 1.3026315789473684,
"grad_norm": 0.05345854163169861,
"learning_rate": 1e-06,
"loss": 0.0245,
"step": 495
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.11757103353738785,
"epoch": 1.305263157894737,
"grad_norm": 0.00989691074937582,
"learning_rate": 1e-06,
"loss": -0.0039,
"step": 496
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9763.0,
"completions/mean_length": 871.619140625,
"completions/mean_terminated_length": 780.1906127929688,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.1206367239356041,
"epoch": 1.3078947368421052,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.048302192240953445,
"learning_rate": 1e-06,
"loss": 0.0334,
"num_tokens": 163535829.0,
"reward": 0.830558180809021,
"reward_std": 0.14636383950710297,
"rewards/progression_diversity/mean": -0.000826410308945924,
"rewards/progression_diversity/std": 0.01099415123462677,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9501953125,
"rewards/symbolic_reward_partial_score/std": 0.1905975341796875,
"rewards/tag_count_reward/mean": -0.005859375,
"rewards/tag_count_reward/std": 0.07639661431312561,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.020845651626587,
"sampling/importance_sampling_ratio/min": 1.0631980984499023e-07,
"sampling/sampling_logp_difference/max": 16.056814193725586,
"sampling/sampling_logp_difference/mean": 0.06594814360141754,
"step": 497
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.1199469231069088,
"epoch": 1.3105263157894738,
"grad_norm": 0.013368850573897362,
"learning_rate": 1e-06,
"loss": 0.0202,
"step": 498
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.12389621138572693,
"epoch": 1.313157894736842,
"grad_norm": 0.015475841239094734,
"learning_rate": 1e-06,
"loss": 0.0216,
"step": 499
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.12212049216032028,
"epoch": 1.3157894736842106,
"grad_norm": 0.018125230446457863,
"learning_rate": 1e-06,
"loss": 0.0084,
"step": 500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6161.0,
"completions/mean_length": 1061.85546875,
"completions/mean_terminated_length": 725.4411010742188,
"completions/min_length": 251.0,
"completions/min_terminated_length": 251.0,
"entropy": 0.12253501266241074,
"epoch": 1.318421052631579,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.031911756843328476,
"learning_rate": 1e-06,
"loss": 0.0289,
"num_tokens": 164483307.0,
"reward": 0.8256685137748718,
"reward_std": 0.12521421909332275,
"rewards/progression_diversity/mean": -0.0015109577216207981,
"rewards/progression_diversity/std": 0.028086457401514053,
"rewards/symbolic_reward_accuracy/mean": 0.908203125,
"rewards/symbolic_reward_accuracy/std": 0.289021372795105,
"rewards/symbolic_reward_partial_score/mean": 0.9423828125,
"rewards/symbolic_reward_partial_score/std": 0.20720478892326355,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.019927740097046,
"sampling/importance_sampling_ratio/min": 7.929816092655528e-06,
"sampling/sampling_logp_difference/max": 11.744880676269531,
"sampling/sampling_logp_difference/mean": 0.06401927769184113,
"step": 501
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.11759228631854057,
"epoch": 1.3210526315789473,
"grad_norm": 0.14602532982826233,
"learning_rate": 1e-06,
"loss": 0.0659,
"step": 502
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.12064041569828987,
"epoch": 1.3236842105263158,
"grad_norm": 0.01236443966627121,
"learning_rate": 1e-06,
"loss": -0.0013,
"step": 503
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0390625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0859375,
"entropy": 0.12389854341745377,
"epoch": 1.3263157894736843,
"grad_norm": 0.015112272463738918,
"learning_rate": 1e-06,
"loss": -0.0001,
"step": 504
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6364.0,
"completions/mean_length": 1221.919921875,
"completions/mean_terminated_length": 826.9158325195312,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"entropy": 0.12016144394874573,
"epoch": 1.3289473684210527,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.019961973652243614,
"learning_rate": 1e-06,
"loss": 0.0151,
"num_tokens": 165545602.0,
"reward": 0.8171666860580444,
"reward_std": 0.16274571418762207,
"rewards/progression_diversity/mean": -0.0020828458946198225,
"rewards/progression_diversity/std": 0.021263638511300087,
"rewards/symbolic_reward_accuracy/mean": 0.904296875,
"rewards/symbolic_reward_accuracy/std": 0.2944713830947876,
"rewards/symbolic_reward_partial_score/mean": 0.9231770634651184,
"rewards/symbolic_reward_partial_score/std": 0.2538268566131592,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0189927816390991,
"sampling/importance_sampling_ratio/min": 2.001787464678273e-07,
"sampling/sampling_logp_difference/max": 15.424055099487305,
"sampling/sampling_logp_difference/mean": 0.06025902181863785,
"step": 505
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.1265234798192978,
"epoch": 1.331578947368421,
"grad_norm": 0.014271781779825687,
"learning_rate": 1e-06,
"loss": 0.0147,
"step": 506
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.11439178138971329,
"epoch": 1.3342105263157895,
"grad_norm": 0.011770401149988174,
"learning_rate": 1e-06,
"loss": 0.0508,
"step": 507
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.11405204609036446,
"epoch": 1.3368421052631578,
"grad_norm": 0.013367000967264175,
"learning_rate": 1e-06,
"loss": 0.0757,
"step": 508
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6573.0,
"completions/mean_length": 1136.01171875,
"completions/mean_terminated_length": 832.2669677734375,
"completions/min_length": 249.0,
"completions/min_terminated_length": 249.0,
"entropy": 0.11793714761734009,
"epoch": 1.3394736842105264,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.030582962557673454,
"learning_rate": 1e-06,
"loss": 0.0369,
"num_tokens": 166528040.0,
"reward": 0.8212710022926331,
"reward_std": 0.13752031326293945,
"rewards/progression_diversity/mean": -0.0018108001677319407,
"rewards/progression_diversity/std": 0.021729890257120132,
"rewards/symbolic_reward_accuracy/mean": 0.904296875,
"rewards/symbolic_reward_accuracy/std": 0.2944713830947876,
"rewards/symbolic_reward_partial_score/mean": 0.935546875,
"rewards/symbolic_reward_partial_score/std": 0.22706012427806854,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0203887224197388,
"sampling/importance_sampling_ratio/min": 2.9788969186483882e-05,
"sampling/sampling_logp_difference/max": 10.421372413635254,
"sampling/sampling_logp_difference/mean": 0.06702390313148499,
"step": 509
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.11901584640145302,
"epoch": 1.3421052631578947,
"grad_norm": 0.0400007963180542,
"learning_rate": 1e-06,
"loss": 0.0253,
"step": 510
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.11924078688025475,
"epoch": 1.3447368421052632,
"grad_norm": 0.02401769533753395,
"learning_rate": 1e-06,
"loss": 0.0038,
"step": 511
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.12153895944356918,
"epoch": 1.3473684210526315,
"grad_norm": 0.0278632752597332,
"learning_rate": 1e-06,
"loss": 0.0202,
"step": 512
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6421.0,
"completions/mean_length": 1049.525390625,
"completions/mean_terminated_length": 712.84033203125,
"completions/min_length": 247.0,
"completions/min_terminated_length": 247.0,
"entropy": 0.1217559427022934,
"epoch": 1.35,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.01790975034236908,
"learning_rate": 1e-06,
"loss": 0.0213,
"num_tokens": 167447669.0,
"reward": 0.8032069802284241,
"reward_std": 0.15127655863761902,
"rewards/progression_diversity/mean": -0.0015715567860752344,
"rewards/progression_diversity/std": 0.016383837908506393,
"rewards/symbolic_reward_accuracy/mean": 0.87890625,
"rewards/symbolic_reward_accuracy/std": 0.3265552520751953,
"rewards/symbolic_reward_partial_score/mean": 0.9267578125,
"rewards/symbolic_reward_partial_score/std": 0.23392236232757568,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0203694105148315,
"sampling/importance_sampling_ratio/min": 7.359984124377661e-07,
"sampling/sampling_logp_difference/max": 14.122037887573242,
"sampling/sampling_logp_difference/mean": 0.06768319010734558,
"step": 513
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.1187475174665451,
"epoch": 1.3526315789473684,
"grad_norm": 0.009799002669751644,
"learning_rate": 1e-06,
"loss": 0.0417,
"step": 514
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.11943993717432022,
"epoch": 1.3552631578947367,
"grad_norm": 0.020848069339990616,
"learning_rate": 1e-06,
"loss": 0.0084,
"step": 515
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.12406942620873451,
"epoch": 1.3578947368421053,
"grad_norm": 0.019777188077569008,
"learning_rate": 1e-06,
"loss": -0.0014,
"step": 516
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11133.0,
"completions/mean_length": 1168.974609375,
"completions/mean_terminated_length": 834.9121704101562,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"entropy": 0.1224772073328495,
"epoch": 1.3605263157894738,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.021967625245451927,
"learning_rate": 1e-06,
"loss": 0.0293,
"num_tokens": 168440488.0,
"reward": 0.8120394945144653,
"reward_std": 0.16349293291568756,
"rewards/progression_diversity/mean": -0.0021093892864882946,
"rewards/progression_diversity/std": 0.01896587759256363,
"rewards/symbolic_reward_accuracy/mean": 0.89453125,
"rewards/symbolic_reward_accuracy/std": 0.3074568510055542,
"rewards/symbolic_reward_partial_score/mean": 0.9249674081802368,
"rewards/symbolic_reward_partial_score/std": 0.24613560736179352,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0184623003005981,
"sampling/importance_sampling_ratio/min": 1.3710611028727726e-06,
"sampling/sampling_logp_difference/max": 13.49992561340332,
"sampling/sampling_logp_difference/mean": 0.06050651893019676,
"step": 517
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.11167807132005692,
"epoch": 1.3631578947368421,
"grad_norm": 0.03911877050995827,
"learning_rate": 1e-06,
"loss": 0.083,
"step": 518
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.11478234827518463,
"epoch": 1.3657894736842104,
"grad_norm": 0.020847424864768982,
"learning_rate": 1e-06,
"loss": 0.0529,
"step": 519
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.11543101072311401,
"epoch": 1.368421052631579,
"grad_norm": 0.018829762935638428,
"learning_rate": 1e-06,
"loss": 0.0307,
"step": 520
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6398.0,
"completions/mean_length": 1195.33984375,
"completions/mean_terminated_length": 736.9295654296875,
"completions/min_length": 257.0,
"completions/min_terminated_length": 257.0,
"entropy": 0.12062574923038483,
"epoch": 1.3710526315789473,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.014021288603544235,
"learning_rate": 1e-06,
"loss": 0.0427,
"num_tokens": 169450134.0,
"reward": 0.7832722663879395,
"reward_std": 0.13144497573375702,
"rewards/progression_diversity/mean": -0.002853758167475462,
"rewards/progression_diversity/std": 0.023874642327427864,
"rewards/symbolic_reward_accuracy/mean": 0.853515625,
"rewards/symbolic_reward_accuracy/std": 0.35393697023391724,
"rewards/symbolic_reward_partial_score/mean": 0.9137369394302368,
"rewards/symbolic_reward_partial_score/std": 0.2524980306625366,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0197254419326782,
"sampling/importance_sampling_ratio/min": 1.406861771338595e-16,
"sampling/sampling_logp_difference/max": 36.5,
"sampling/sampling_logp_difference/mean": 0.064872145652771,
"step": 521
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.12126883119344711,
"epoch": 1.3736842105263158,
"grad_norm": 0.018665987998247147,
"learning_rate": 1e-06,
"loss": 0.0304,
"step": 522
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.1257994957268238,
"epoch": 1.3763157894736842,
"grad_norm": 0.028348246589303017,
"learning_rate": 1e-06,
"loss": 0.0122,
"step": 523
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.12155702710151672,
"epoch": 1.3789473684210527,
"grad_norm": 0.027928415685892105,
"learning_rate": 1e-06,
"loss": 0.0405,
"step": 524
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15986.0,
"completions/mean_length": 1216.671875,
"completions/mean_terminated_length": 758.9053955078125,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 0.12585609033703804,
"epoch": 1.381578947368421,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.020609745755791664,
"learning_rate": 1e-06,
"loss": 0.0281,
"num_tokens": 170497390.0,
"reward": 0.8209607601165771,
"reward_std": 0.13462477922439575,
"rewards/progression_diversity/mean": -0.003541269339621067,
"rewards/progression_diversity/std": 0.028825946152210236,
"rewards/symbolic_reward_accuracy/mean": 0.904296875,
"rewards/symbolic_reward_accuracy/std": 0.2944713830947876,
"rewards/symbolic_reward_partial_score/mean": 0.9378255009651184,
"rewards/symbolic_reward_partial_score/std": 0.22360049188137054,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0208656787872314,
"sampling/importance_sampling_ratio/min": 2.124352249666117e-07,
"sampling/sampling_logp_difference/max": 15.364628791809082,
"sampling/sampling_logp_difference/mean": 0.06426618993282318,
"step": 525
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.1270783357322216,
"epoch": 1.3842105263157896,
"grad_norm": 0.015202553011476994,
"learning_rate": 1e-06,
"loss": 0.0751,
"step": 526
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.13192099332809448,
"epoch": 1.3868421052631579,
"grad_norm": 0.02699258364737034,
"learning_rate": 1e-06,
"loss": 0.022,
"step": 527
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.13195763528347015,
"epoch": 1.3894736842105262,
"grad_norm": 0.02506939321756363,
"learning_rate": 1e-06,
"loss": 0.0573,
"step": 528
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5820.0,
"completions/mean_length": 1409.9921875,
"completions/mean_terminated_length": 705.6932373046875,
"completions/min_length": 264.0,
"completions/min_terminated_length": 264.0,
"entropy": 0.132181815803051,
"epoch": 1.3921052631578947,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.015631964430212975,
"learning_rate": 1e-06,
"loss": 0.0212,
"num_tokens": 171635146.0,
"reward": 0.7929407954216003,
"reward_std": 0.14727577567100525,
"rewards/progression_diversity/mean": -0.007682529743760824,
"rewards/progression_diversity/std": 0.043747011572122574,
"rewards/symbolic_reward_accuracy/mean": 0.8671875,
"rewards/symbolic_reward_accuracy/std": 0.33970388770103455,
"rewards/symbolic_reward_partial_score/mean": 0.9239909052848816,
"rewards/symbolic_reward_partial_score/std": 0.23265688121318817,
"rewards/tag_count_reward/mean": -0.044921875,
"rewards/tag_count_reward/std": 0.20733514428138733,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0227715969085693,
"sampling/importance_sampling_ratio/min": 1.4806555270752142e-07,
"sampling/sampling_logp_difference/max": 15.725610733032227,
"sampling/sampling_logp_difference/mean": 0.06556607782840729,
"step": 529
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.12692655250430107,
"epoch": 1.3947368421052633,
"grad_norm": 0.013282880187034607,
"learning_rate": 1e-06,
"loss": 0.0761,
"step": 530
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.13208907842636108,
"epoch": 1.3973684210526316,
"grad_norm": 0.01377064548432827,
"learning_rate": 1e-06,
"loss": 0.0366,
"step": 531
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.13234106451272964,
"epoch": 1.4,
"grad_norm": 0.020579254254698753,
"learning_rate": 1e-06,
"loss": 0.0348,
"step": 532
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4966.0,
"completions/mean_length": 1032.94921875,
"completions/mean_terminated_length": 695.9002075195312,
"completions/min_length": 243.0,
"completions/min_terminated_length": 243.0,
"entropy": 0.12633076682686806,
"epoch": 1.4026315789473685,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.039995189756155014,
"learning_rate": 1e-06,
"loss": 0.101,
"num_tokens": 172575824.0,
"reward": 0.8291662931442261,
"reward_std": 0.12353667616844177,
"rewards/progression_diversity/mean": -0.003291813191026449,
"rewards/progression_diversity/std": 0.029986631125211716,
"rewards/symbolic_reward_accuracy/mean": 0.912109375,
"rewards/symbolic_reward_accuracy/std": 0.2834126651287079,
"rewards/symbolic_reward_partial_score/mean": 0.9469400644302368,
"rewards/symbolic_reward_partial_score/std": 0.20057909190654755,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0211803913116455,
"sampling/importance_sampling_ratio/min": 8.730064109840896e-06,
"sampling/sampling_logp_difference/max": 11.648737907409668,
"sampling/sampling_logp_difference/mean": 0.0656655952334404,
"step": 533
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.13137900829315186,
"epoch": 1.4052631578947368,
"grad_norm": 0.018619263544678688,
"learning_rate": 1e-06,
"loss": -0.0028,
"step": 534
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.1252904236316681,
"epoch": 1.4078947368421053,
"grad_norm": 0.015142420306801796,
"learning_rate": 1e-06,
"loss": 0.0256,
"step": 535
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.13380999118089676,
"epoch": 1.4105263157894736,
"grad_norm": 0.0105243269354105,
"learning_rate": 1e-06,
"loss": 0.0065,
"step": 536
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3757.0,
"completions/mean_length": 1088.859375,
"completions/mean_terminated_length": 658.87548828125,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.13350340723991394,
"epoch": 1.4131578947368422,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.03917808085680008,
"learning_rate": 1e-06,
"loss": 0.0387,
"num_tokens": 173525096.0,
"reward": 0.8361979722976685,
"reward_std": 0.11943801492452621,
"rewards/progression_diversity/mean": -0.0032509397715330124,
"rewards/progression_diversity/std": 0.02727237343788147,
"rewards/symbolic_reward_accuracy/mean": 0.923828125,
"rewards/symbolic_reward_accuracy/std": 0.26553234457969666,
"rewards/symbolic_reward_partial_score/mean": 0.9488931894302368,
"rewards/symbolic_reward_partial_score/std": 0.2026350498199463,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0211467742919922,
"sampling/importance_sampling_ratio/min": 5.162648449186236e-06,
"sampling/sampling_logp_difference/max": 12.174060821533203,
"sampling/sampling_logp_difference/mean": 0.0664573684334755,
"step": 537
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.13789378851652145,
"epoch": 1.4157894736842105,
"grad_norm": 0.008778770454227924,
"learning_rate": 1e-06,
"loss": 0.0109,
"step": 538
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.13081741333007812,
"epoch": 1.418421052631579,
"grad_norm": 0.019565429538488388,
"learning_rate": 1e-06,
"loss": 0.0334,
"step": 539
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.12705697864294052,
"epoch": 1.4210526315789473,
"grad_norm": 0.020435446873307228,
"learning_rate": 1e-06,
"loss": 0.0642,
"step": 540
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7683.0,
"completions/mean_length": 1142.486328125,
"completions/mean_terminated_length": 619.0404052734375,
"completions/min_length": 226.0,
"completions/min_terminated_length": 226.0,
"entropy": 0.12402678281068802,
"epoch": 1.4236842105263157,
"frac_reward_zero_std": 0.65625,
"grad_norm": 0.022379858419299126,
"learning_rate": 1e-06,
"loss": 0.0262,
"num_tokens": 174504833.0,
"reward": 0.8272143006324768,
"reward_std": 0.06737488508224487,
"rewards/progression_diversity/mean": -0.0031863353215157986,
"rewards/progression_diversity/std": 0.028556160628795624,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9475911259651184,
"rewards/symbolic_reward_partial_score/std": 0.2016286551952362,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0205636024475098,
"sampling/importance_sampling_ratio/min": 2.388037658853444e-15,
"sampling/sampling_logp_difference/max": 33.668304443359375,
"sampling/sampling_logp_difference/mean": 0.06742183864116669,
"step": 541
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.13175051659345627,
"epoch": 1.4263157894736842,
"grad_norm": 0.009576407261192799,
"learning_rate": 1e-06,
"loss": 0.0465,
"step": 542
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.125,
"entropy": 0.12570127844810486,
"epoch": 1.4289473684210527,
"grad_norm": 0.011829288676381111,
"learning_rate": 1e-06,
"loss": 0.0597,
"step": 543
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.1273176595568657,
"epoch": 1.431578947368421,
"grad_norm": 0.01299221906810999,
"learning_rate": 1e-06,
"loss": 0.0145,
"step": 544
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16202.0,
"completions/mean_length": 961.04296875,
"completions/mean_terminated_length": 622.4151611328125,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"entropy": 0.11435278132557869,
"epoch": 1.4342105263157894,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.030455466359853745,
"learning_rate": 1e-06,
"loss": 0.0736,
"num_tokens": 175375831.0,
"reward": 0.8315600156784058,
"reward_std": 0.10956034064292908,
"rewards/progression_diversity/mean": -0.00318007729947567,
"rewards/progression_diversity/std": 0.026876848191022873,
"rewards/symbolic_reward_accuracy/mean": 0.9140625,
"rewards/symbolic_reward_accuracy/std": 0.28054583072662354,
"rewards/symbolic_reward_partial_score/mean": 0.9510090947151184,
"rewards/symbolic_reward_partial_score/std": 0.19548769295215607,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0186996459960938,
"sampling/importance_sampling_ratio/min": 6.467719504144043e-05,
"sampling/sampling_logp_difference/max": 9.646101951599121,
"sampling/sampling_logp_difference/mean": 0.06452836096286774,
"step": 545
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.1248774528503418,
"epoch": 1.436842105263158,
"grad_norm": 0.030380593612790108,
"learning_rate": 1e-06,
"loss": 0.0411,
"step": 546
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.11489719897508621,
"epoch": 1.4394736842105262,
"grad_norm": 0.013684880919754505,
"learning_rate": 1e-06,
"loss": 0.031,
"step": 547
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.12723349034786224,
"epoch": 1.4421052631578948,
"grad_norm": 0.005004690028727055,
"learning_rate": 1e-06,
"loss": 0.0033,
"step": 548
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7127.0,
"completions/mean_length": 1883.12109375,
"completions/mean_terminated_length": 720.6033325195312,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"entropy": 0.11534935608506203,
"epoch": 1.444736842105263,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.0357719361782074,
"learning_rate": 1e-06,
"loss": 0.064,
"num_tokens": 176770005.0,
"reward": 0.7715877294540405,
"reward_std": 0.17539432644844055,
"rewards/progression_diversity/mean": -0.009200896136462688,
"rewards/progression_diversity/std": 0.044748030602931976,
"rewards/symbolic_reward_accuracy/mean": 0.849609375,
"rewards/symbolic_reward_accuracy/std": 0.35780346393585205,
"rewards/symbolic_reward_partial_score/mean": 0.8977864980697632,
"rewards/symbolic_reward_partial_score/std": 0.2720419764518738,
"rewards/tag_count_reward/mean": -0.07421875,
"rewards/tag_count_reward/std": 0.2623828947544098,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.017478108406067,
"sampling/importance_sampling_ratio/min": 2.1775213099317625e-05,
"sampling/sampling_logp_difference/max": 10.73473834991455,
"sampling/sampling_logp_difference/mean": 0.05594378709793091,
"step": 549
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.11893630772829056,
"epoch": 1.4473684210526316,
"grad_norm": 0.020856492221355438,
"learning_rate": 1e-06,
"loss": 0.0626,
"step": 550
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.10961251333355904,
"epoch": 1.45,
"grad_norm": 0.032535944133996964,
"learning_rate": 1e-06,
"loss": 0.1122,
"step": 551
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.12273216247558594,
"epoch": 1.4526315789473685,
"grad_norm": 0.015543154440820217,
"learning_rate": 1e-06,
"loss": 0.0734,
"step": 552
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3752.0,
"completions/mean_length": 1222.4609375,
"completions/mean_terminated_length": 638.1419677734375,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 0.12245375290513039,
"epoch": 1.4552631578947368,
"frac_reward_zero_std": 0.65625,
"grad_norm": 0.010475813411176205,
"learning_rate": 1e-06,
"loss": 0.0767,
"num_tokens": 177793537.0,
"reward": 0.8133152723312378,
"reward_std": 0.0920829027891159,
"rewards/progression_diversity/mean": -0.006364539731293917,
"rewards/progression_diversity/std": 0.04115993529558182,
"rewards/symbolic_reward_accuracy/mean": 0.89453125,
"rewards/symbolic_reward_accuracy/std": 0.3074568510055542,
"rewards/symbolic_reward_partial_score/mean": 0.9345703125,
"rewards/symbolic_reward_partial_score/std": 0.2200278490781784,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0192129611968994,
"sampling/importance_sampling_ratio/min": 5.405944246937577e-13,
"sampling/sampling_logp_difference/max": 28.24610710144043,
"sampling/sampling_logp_difference/mean": 0.06582315266132355,
"step": 553
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0859375,
"entropy": 0.1257517747581005,
"epoch": 1.4578947368421051,
"grad_norm": 0.010606672614812851,
"learning_rate": 1e-06,
"loss": 0.0181,
"step": 554
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.1262059360742569,
"epoch": 1.4605263157894737,
"grad_norm": 0.027410009875893593,
"learning_rate": 1e-06,
"loss": 0.05,
"step": 555
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.12493745982646942,
"epoch": 1.4631578947368422,
"grad_norm": 0.011104092933237553,
"learning_rate": 1e-06,
"loss": 0.0094,
"step": 556
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6435.0,
"completions/mean_length": 829.318359375,
"completions/mean_terminated_length": 644.8755493164062,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"entropy": 0.1371982842683792,
"epoch": 1.4657894736842105,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.05659002810716629,
"learning_rate": 1e-06,
"loss": 0.0077,
"num_tokens": 178611268.0,
"reward": 0.8500331044197083,
"reward_std": 0.11168558895587921,
"rewards/progression_diversity/mean": -0.001577683025971055,
"rewards/progression_diversity/std": 0.019814975559711456,
"rewards/symbolic_reward_accuracy/mean": 0.9375,
"rewards/symbolic_reward_accuracy/std": 0.2422981858253479,
"rewards/symbolic_reward_partial_score/mean": 0.96240234375,
"rewards/symbolic_reward_partial_score/std": 0.17325447499752045,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0206682682037354,
"sampling/importance_sampling_ratio/min": 8.104348694359942e-07,
"sampling/sampling_logp_difference/max": 14.025694847106934,
"sampling/sampling_logp_difference/mean": 0.0689624696969986,
"step": 557
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.13341526687145233,
"epoch": 1.4684210526315788,
"grad_norm": 0.010691308416426182,
"learning_rate": 1e-06,
"loss": -0.005,
"step": 558
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.13226507604122162,
"epoch": 1.4710526315789474,
"grad_norm": 0.01808001846075058,
"learning_rate": 1e-06,
"loss": 0.0304,
"step": 559
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.12875394523143768,
"epoch": 1.4736842105263157,
"grad_norm": 0.01617010124027729,
"learning_rate": 1e-06,
"loss": 0.0914,
"step": 560
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5451.0,
"completions/mean_length": 817.533203125,
"completions/mean_terminated_length": 664.0177612304688,
"completions/min_length": 251.0,
"completions/min_terminated_length": 251.0,
"entropy": 0.1301540583372116,
"epoch": 1.4763157894736842,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.026065807789564133,
"learning_rate": 1e-06,
"loss": 0.0329,
"num_tokens": 179427317.0,
"reward": 0.8277164697647095,
"reward_std": 0.10951961576938629,
"rewards/progression_diversity/mean": -0.0017917966470122337,
"rewards/progression_diversity/std": 0.020995857194066048,
"rewards/symbolic_reward_accuracy/mean": 0.900390625,
"rewards/symbolic_reward_accuracy/std": 0.29977133870124817,
"rewards/symbolic_reward_partial_score/mean": 0.9615885019302368,
"rewards/symbolic_reward_partial_score/std": 0.14556662738323212,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0219957828521729,
"sampling/importance_sampling_ratio/min": 3.670751175377518e-05,
"sampling/sampling_logp_difference/max": 10.212529182434082,
"sampling/sampling_logp_difference/mean": 0.0698399469256401,
"step": 561
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.13334877789020538,
"epoch": 1.4789473684210526,
"grad_norm": 0.008394371718168259,
"learning_rate": 1e-06,
"loss": 0.0013,
"step": 562
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.13843107223510742,
"epoch": 1.481578947368421,
"grad_norm": 0.04058850556612015,
"learning_rate": 1e-06,
"loss": 0.016,
"step": 563
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.13118517398834229,
"epoch": 1.4842105263157894,
"grad_norm": 0.01614241674542427,
"learning_rate": 1e-06,
"loss": 0.0288,
"step": 564
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6531.0,
"completions/mean_length": 730.0703125,
"completions/mean_terminated_length": 668.682373046875,
"completions/min_length": 258.0,
"completions/min_terminated_length": 258.0,
"entropy": 0.1351623460650444,
"epoch": 1.486842105263158,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.01471333485096693,
"learning_rate": 1e-06,
"loss": 0.0028,
"num_tokens": 180205241.0,
"reward": 0.8588860034942627,
"reward_std": 0.09307113289833069,
"rewards/progression_diversity/mean": -7.880153134465218e-05,
"rewards/progression_diversity/std": 0.0017830750439316034,
"rewards/symbolic_reward_accuracy/mean": 0.9453125,
"rewards/symbolic_reward_accuracy/std": 0.2275916188955307,
"rewards/symbolic_reward_partial_score/mean": 0.9736328125,
"rewards/symbolic_reward_partial_score/std": 0.13708151876926422,
"rewards/tag_count_reward/mean": -0.00390625,
"rewards/tag_count_reward/std": 0.06243881583213806,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0231963396072388,
"sampling/importance_sampling_ratio/min": 1.8753072481558775e-06,
"sampling/sampling_logp_difference/max": 13.186738014221191,
"sampling/sampling_logp_difference/mean": 0.07400602847337723,
"step": 565
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0859375,
"entropy": 0.13259483128786087,
"epoch": 1.4894736842105263,
"grad_norm": 0.011994445696473122,
"learning_rate": 1e-06,
"loss": 0.0206,
"step": 566
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.13419703394174576,
"epoch": 1.4921052631578946,
"grad_norm": 0.009042926132678986,
"learning_rate": 1e-06,
"loss": 0.0123,
"step": 567
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.13261277973651886,
"epoch": 1.4947368421052631,
"grad_norm": 0.013076446950435638,
"learning_rate": 1e-06,
"loss": 0.0181,
"step": 568
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7868.0,
"completions/mean_length": 976.060546875,
"completions/mean_terminated_length": 700.3717651367188,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"entropy": 0.13073333352804184,
"epoch": 1.4973684210526317,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.017082160338759422,
"learning_rate": 1e-06,
"loss": 0.0381,
"num_tokens": 181120984.0,
"reward": 0.8026704788208008,
"reward_std": 0.17961543798446655,
"rewards/progression_diversity/mean": -0.0015097158029675484,
"rewards/progression_diversity/std": 0.018514791503548622,
"rewards/symbolic_reward_accuracy/mean": 0.876953125,
"rewards/symbolic_reward_accuracy/std": 0.32881227135658264,
"rewards/symbolic_reward_partial_score/mean": 0.9275716543197632,
"rewards/symbolic_reward_partial_score/std": 0.2205788642168045,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0213254690170288,
"sampling/importance_sampling_ratio/min": 2.747718781392905e-07,
"sampling/sampling_logp_difference/max": 15.107324600219727,
"sampling/sampling_logp_difference/mean": 0.07009261846542358,
"step": 569
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.1352020874619484,
"epoch": 1.5,
"grad_norm": 0.02589314803481102,
"learning_rate": 1e-06,
"loss": 0.0211,
"step": 570
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.13306277990341187,
"epoch": 1.5026315789473683,
"grad_norm": 0.018376614898443222,
"learning_rate": 1e-06,
"loss": 0.014,
"step": 571
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.1296408474445343,
"epoch": 1.5052631578947369,
"grad_norm": 0.028880199417471886,
"learning_rate": 1e-06,
"loss": 0.0695,
"step": 572
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5029.0,
"completions/mean_length": 1003.607421875,
"completions/mean_terminated_length": 665.9141845703125,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.1362362802028656,
"epoch": 1.5078947368421054,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.03721587732434273,
"learning_rate": 1e-06,
"loss": 0.0271,
"num_tokens": 182032623.0,
"reward": 0.8145319223403931,
"reward_std": 0.13320812582969666,
"rewards/progression_diversity/mean": -0.001894364831969142,
"rewards/progression_diversity/std": 0.017769131809473038,
"rewards/symbolic_reward_accuracy/mean": 0.892578125,
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534,
"rewards/symbolic_reward_partial_score/mean": 0.9371744990348816,
"rewards/symbolic_reward_partial_score/std": 0.2098037451505661,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0225517749786377,
"sampling/importance_sampling_ratio/min": 1.7453003763891715e-15,
"sampling/sampling_logp_difference/max": 33.981849670410156,
"sampling/sampling_logp_difference/mean": 0.07218128442764282,
"step": 573
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.125,
"entropy": 0.1370982825756073,
"epoch": 1.5105263157894737,
"grad_norm": 0.01768220029771328,
"learning_rate": 1e-06,
"loss": 0.0052,
"step": 574
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.125,
"entropy": 0.1406766176223755,
"epoch": 1.513157894736842,
"grad_norm": 0.025351712480187416,
"learning_rate": 1e-06,
"loss": 0.0384,
"step": 575
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.1350262686610222,
"epoch": 1.5157894736842106,
"grad_norm": 0.012894677929580212,
"learning_rate": 1e-06,
"loss": 0.0346,
"step": 576
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6116.0,
"completions/mean_length": 1377.828125,
"completions/mean_terminated_length": 831.0445556640625,
"completions/min_length": 322.0,
"completions/min_terminated_length": 322.0,
"entropy": 0.13445448130369186,
"epoch": 1.518421052631579,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.034472037106752396,
"learning_rate": 1e-06,
"loss": 0.0196,
"num_tokens": 183164919.0,
"reward": 0.7574737071990967,
"reward_std": 0.18111687898635864,
"rewards/progression_diversity/mean": -0.004586691036820412,
"rewards/progression_diversity/std": 0.0347183421254158,
"rewards/symbolic_reward_accuracy/mean": 0.822265625,
"rewards/symbolic_reward_accuracy/std": 0.3826628625392914,
"rewards/symbolic_reward_partial_score/mean": 0.8916015625,
"rewards/symbolic_reward_partial_score/std": 0.2719910144805908,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0212225914001465,
"sampling/importance_sampling_ratio/min": 3.2291410434481804e-08,
"sampling/sampling_logp_difference/max": 17.248464584350586,
"sampling/sampling_logp_difference/mean": 0.06880679726600647,
"step": 577
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.1360417976975441,
"epoch": 1.5210526315789474,
"grad_norm": 0.01104611437767744,
"learning_rate": 1e-06,
"loss": 0.0152,
"step": 578
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.12674961611628532,
"epoch": 1.5236842105263158,
"grad_norm": 0.045898765325546265,
"learning_rate": 1e-06,
"loss": 0.0459,
"step": 579
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.125128373503685,
"epoch": 1.526315789473684,
"grad_norm": 0.015659412369132042,
"learning_rate": 1e-06,
"loss": 0.0575,
"step": 580
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5905.0,
"completions/mean_length": 1173.80859375,
"completions/mean_terminated_length": 714.7484741210938,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.12383318692445755,
"epoch": 1.5289473684210526,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.04507393762469292,
"learning_rate": 1e-06,
"loss": 0.0544,
"num_tokens": 184160693.0,
"reward": 0.8106682300567627,
"reward_std": 0.1318516582250595,
"rewards/progression_diversity/mean": -0.0025172452442348003,
"rewards/progression_diversity/std": 0.021089140325784683,
"rewards/symbolic_reward_accuracy/mean": 0.890625,
"rewards/symbolic_reward_accuracy/std": 0.31241437792778015,
"rewards/symbolic_reward_partial_score/mean": 0.9308268427848816,
"rewards/symbolic_reward_partial_score/std": 0.2209477424621582,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0210371017456055,
"sampling/importance_sampling_ratio/min": 1.1693467028006665e-10,
"sampling/sampling_logp_difference/max": 22.86940574645996,
"sampling/sampling_logp_difference/mean": 0.07237155735492706,
"step": 581
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.12782908231019974,
"epoch": 1.5315789473684212,
"grad_norm": 0.010001985356211662,
"learning_rate": 1e-06,
"loss": 0.0502,
"step": 582
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.12940338253974915,
"epoch": 1.5342105263157895,
"grad_norm": 0.022164426743984222,
"learning_rate": 1e-06,
"loss": 0.0063,
"step": 583
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0234375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1015625,
"entropy": 0.1259337067604065,
"epoch": 1.5368421052631578,
"grad_norm": 0.010080978274345398,
"learning_rate": 1e-06,
"loss": 0.0076,
"step": 584
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5788.0,
"completions/mean_length": 1110.978515625,
"completions/mean_terminated_length": 744.426025390625,
"completions/min_length": 234.0,
"completions/min_terminated_length": 234.0,
"entropy": 0.11713157966732979,
"epoch": 1.5394736842105263,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.034748658537864685,
"learning_rate": 1e-06,
"loss": 0.0692,
"num_tokens": 185156842.0,
"reward": 0.8088133931159973,
"reward_std": 0.16672645509243011,
"rewards/progression_diversity/mean": -0.0024531371891498566,
"rewards/progression_diversity/std": 0.021702419966459274,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9337564706802368,
"rewards/symbolic_reward_partial_score/std": 0.2161990851163864,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.019039511680603,
"sampling/importance_sampling_ratio/min": 1.6557880826439941e-06,
"sampling/sampling_logp_difference/max": 13.311233520507812,
"sampling/sampling_logp_difference/mean": 0.06788427382707596,
"step": 585
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.12097888439893723,
"epoch": 1.5421052631578949,
"grad_norm": 0.02093541994690895,
"learning_rate": 1e-06,
"loss": 0.0109,
"step": 586
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.12494004517793655,
"epoch": 1.5447368421052632,
"grad_norm": 0.012646614573895931,
"learning_rate": 1e-06,
"loss": -0.0019,
"step": 587
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.12453506141901016,
"epoch": 1.5473684210526315,
"grad_norm": 0.020611083135008812,
"learning_rate": 1e-06,
"loss": 0.0183,
"step": 588
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4762.0,
"completions/mean_length": 1052.017578125,
"completions/mean_terminated_length": 715.38720703125,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"entropy": 0.1217985488474369,
"epoch": 1.55,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.038163021206855774,
"learning_rate": 1e-06,
"loss": 0.0278,
"num_tokens": 186078931.0,
"reward": 0.8041250109672546,
"reward_std": 0.16053986549377441,
"rewards/progression_diversity/mean": -0.002542970236390829,
"rewards/progression_diversity/std": 0.02396521344780922,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9220377206802368,
"rewards/symbolic_reward_partial_score/std": 0.2436649203300476,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0195398330688477,
"sampling/importance_sampling_ratio/min": 1.0385434734416776e-07,
"sampling/sampling_logp_difference/max": 16.080276489257812,
"sampling/sampling_logp_difference/mean": 0.07104349136352539,
"step": 589
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.12670139968395233,
"epoch": 1.5526315789473686,
"grad_norm": 0.02337627112865448,
"learning_rate": 1e-06,
"loss": 0.0112,
"step": 590
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.12126578390598297,
"epoch": 1.555263157894737,
"grad_norm": 0.02081996016204357,
"learning_rate": 1e-06,
"loss": 0.0416,
"step": 591
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.121041189879179,
"epoch": 1.5578947368421052,
"grad_norm": 0.014259042218327522,
"learning_rate": 1e-06,
"loss": 0.0228,
"step": 592
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6324.0,
"completions/mean_length": 1310.251953125,
"completions/mean_terminated_length": 761.006103515625,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"entropy": 0.11463157832622528,
"epoch": 1.5605263157894735,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.03503033518791199,
"learning_rate": 1e-06,
"loss": 0.0623,
"num_tokens": 187157940.0,
"reward": 0.774095892906189,
"reward_std": 0.19663885235786438,
"rewards/progression_diversity/mean": -0.002528228797018528,
"rewards/progression_diversity/std": 0.020892662927508354,
"rewards/symbolic_reward_accuracy/mean": 0.837890625,
"rewards/symbolic_reward_accuracy/std": 0.3689115643501282,
"rewards/symbolic_reward_partial_score/mean": 0.9150390625,
"rewards/symbolic_reward_partial_score/std": 0.22795869410037994,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0178115367889404,
"sampling/importance_sampling_ratio/min": 1.8364110587754112e-07,
"sampling/sampling_logp_difference/max": 15.510282516479492,
"sampling/sampling_logp_difference/mean": 0.06724664568901062,
"step": 593
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.11785297840833664,
"epoch": 1.563157894736842,
"grad_norm": 0.015631457790732384,
"learning_rate": 1e-06,
"loss": 0.0155,
"step": 594
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.11847185716032982,
"epoch": 1.5657894736842106,
"grad_norm": 0.025584295392036438,
"learning_rate": 1e-06,
"loss": 0.034,
"step": 595
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.12040146440267563,
"epoch": 1.568421052631579,
"grad_norm": 0.024837322533130646,
"learning_rate": 1e-06,
"loss": 0.0075,
"step": 596
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7156.0,
"completions/mean_length": 1281.55859375,
"completions/mean_terminated_length": 762.888916015625,
"completions/min_length": 249.0,
"completions/min_terminated_length": 249.0,
"entropy": 0.11434166878461838,
"epoch": 1.5710526315789473,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.031757909804582596,
"learning_rate": 1e-06,
"loss": 0.0399,
"num_tokens": 188220754.0,
"reward": 0.7410222291946411,
"reward_std": 0.2251667082309723,
"rewards/progression_diversity/mean": -0.004226570948958397,
"rewards/progression_diversity/std": 0.034974951297044754,
"rewards/symbolic_reward_accuracy/mean": 0.8046875,
"rewards/symbolic_reward_accuracy/std": 0.3968288004398346,
"rewards/symbolic_reward_partial_score/mean": 0.8719075322151184,
"rewards/symbolic_reward_partial_score/std": 0.29337501525878906,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.018049955368042,
"sampling/importance_sampling_ratio/min": 1.5532393717876403e-06,
"sampling/sampling_logp_difference/max": 13.375167846679688,
"sampling/sampling_logp_difference/mean": 0.06641073524951935,
"step": 597
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.1158062033355236,
"epoch": 1.5736842105263158,
"grad_norm": 0.05890418589115143,
"learning_rate": 1e-06,
"loss": 0.0376,
"step": 598
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.11098961159586906,
"epoch": 1.5763157894736843,
"grad_norm": 0.0232480950653553,
"learning_rate": 1e-06,
"loss": 0.042,
"step": 599
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.11672716960310936,
"epoch": 1.5789473684210527,
"grad_norm": 0.019831934943795204,
"learning_rate": 1e-06,
"loss": 0.0268,
"step": 600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6496.0,
"completions/mean_length": 1332.814453125,
"completions/mean_terminated_length": 752.7484741210938,
"completions/min_length": 233.0,
"completions/min_terminated_length": 233.0,
"entropy": 0.11477554962038994,
"epoch": 1.581578947368421,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.03628187254071236,
"learning_rate": 1e-06,
"loss": 0.018,
"num_tokens": 189330323.0,
"reward": 0.7476116418838501,
"reward_std": 0.23416918516159058,
"rewards/progression_diversity/mean": -0.004457623697817326,
"rewards/progression_diversity/std": 0.032657936215400696,
"rewards/symbolic_reward_accuracy/mean": 0.810546875,
"rewards/symbolic_reward_accuracy/std": 0.3922513723373413,
"rewards/symbolic_reward_partial_score/mean": 0.8834635019302368,
"rewards/symbolic_reward_partial_score/std": 0.28327476978302,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0170378684997559,
"sampling/importance_sampling_ratio/min": 2.704789210383751e-07,
"sampling/sampling_logp_difference/max": 15.123071670532227,
"sampling/sampling_logp_difference/mean": 0.06537644565105438,
"step": 601
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.11666785180568695,
"epoch": 1.5842105263157895,
"grad_norm": 0.03475677967071533,
"learning_rate": 1e-06,
"loss": 0.0451,
"step": 602
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.11766305565834045,
"epoch": 1.586842105263158,
"grad_norm": 0.016544286161661148,
"learning_rate": 1e-06,
"loss": 0.0143,
"step": 603
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.11409174650907516,
"epoch": 1.5894736842105264,
"grad_norm": 0.04165460914373398,
"learning_rate": 1e-06,
"loss": 0.0841,
"step": 604
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7985.0,
"completions/mean_length": 1443.87890625,
"completions/mean_terminated_length": 773.097900390625,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"entropy": 0.11511170864105225,
"epoch": 1.5921052631578947,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.030341695994138718,
"learning_rate": 1e-06,
"loss": 0.0233,
"num_tokens": 190460885.0,
"reward": 0.7671371698379517,
"reward_std": 0.20124712586402893,
"rewards/progression_diversity/mean": -0.005033410154283047,
"rewards/progression_diversity/std": 0.03523511067032814,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.8912760615348816,
"rewards/symbolic_reward_partial_score/std": 0.28599029779434204,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0175724029541016,
"sampling/importance_sampling_ratio/min": 5.44225827070477e-07,
"sampling/sampling_logp_difference/max": 14.423901557922363,
"sampling/sampling_logp_difference/mean": 0.06623612344264984,
"step": 605
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.11644968762993813,
"epoch": 1.594736842105263,
"grad_norm": 0.019640203565359116,
"learning_rate": 1e-06,
"loss": 0.0203,
"step": 606
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.11942460760474205,
"epoch": 1.5973684210526315,
"grad_norm": 0.018807774409651756,
"learning_rate": 1e-06,
"loss": 0.0451,
"step": 607
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.11645659804344177,
"epoch": 1.6,
"grad_norm": 0.018924150615930557,
"learning_rate": 1e-06,
"loss": 0.044,
"step": 608
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6067.0,
"completions/mean_length": 829.654296875,
"completions/mean_terminated_length": 737.9783935546875,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.12432442978024483,
"epoch": 1.6026315789473684,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.025178352370858192,
"learning_rate": 1e-06,
"loss": 0.0161,
"num_tokens": 191283460.0,
"reward": 0.8110204339027405,
"reward_std": 0.1572936773300171,
"rewards/progression_diversity/mean": -0.001474375487305224,
"rewards/progression_diversity/std": 0.02059447206556797,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9358724355697632,
"rewards/symbolic_reward_partial_score/std": 0.20960327982902527,
"rewards/tag_count_reward/mean": -0.005859375,
"rewards/tag_count_reward/std": 0.07639661431312561,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0208200216293335,
"sampling/importance_sampling_ratio/min": 8.66128682305932e-22,
"sampling/sampling_logp_difference/max": 48.498008728027344,
"sampling/sampling_logp_difference/mean": 0.07230201363563538,
"step": 609
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.12939514964818954,
"epoch": 1.6052631578947367,
"grad_norm": 0.012045629322528839,
"learning_rate": 1e-06,
"loss": 0.0036,
"step": 610
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.12374605610966682,
"epoch": 1.6078947368421053,
"grad_norm": 0.019549217075109482,
"learning_rate": 1e-06,
"loss": 0.0286,
"step": 611
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.12776687741279602,
"epoch": 1.6105263157894738,
"grad_norm": 0.014186064712703228,
"learning_rate": 1e-06,
"loss": 0.0144,
"step": 612
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5412.0,
"completions/mean_length": 941.5625,
"completions/mean_terminated_length": 758.4506225585938,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 0.12887892127037048,
"epoch": 1.6131578947368421,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.04177960380911827,
"learning_rate": 1e-06,
"loss": 0.0415,
"num_tokens": 192155012.0,
"reward": 0.7932851314544678,
"reward_std": 0.19080910086631775,
"rewards/progression_diversity/mean": -0.0025479630567133427,
"rewards/progression_diversity/std": 0.03208750858902931,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.90478515625,
"rewards/symbolic_reward_partial_score/std": 0.272712379693985,
"rewards/tag_count_reward/mean": -0.0078125,
"rewards/tag_count_reward/std": 0.08812850713729858,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0208125114440918,
"sampling/importance_sampling_ratio/min": 1.5786597487021936e-06,
"sampling/sampling_logp_difference/max": 13.35893440246582,
"sampling/sampling_logp_difference/mean": 0.07315555959939957,
"step": 613
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.1350940614938736,
"epoch": 1.6157894736842104,
"grad_norm": 0.012729027308523655,
"learning_rate": 1e-06,
"loss": 0.0034,
"step": 614
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.12994906306266785,
"epoch": 1.618421052631579,
"grad_norm": 0.02224569581449032,
"learning_rate": 1e-06,
"loss": 0.0451,
"step": 615
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.13485563546419144,
"epoch": 1.6210526315789475,
"grad_norm": 0.023328816518187523,
"learning_rate": 1e-06,
"loss": 0.0112,
"step": 616
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4567.0,
"completions/mean_length": 1037.134765625,
"completions/mean_terminated_length": 700.1776733398438,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.13107465207576752,
"epoch": 1.6236842105263158,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.03819314390420914,
"learning_rate": 1e-06,
"loss": 0.0452,
"num_tokens": 193093321.0,
"reward": 0.790679931640625,
"reward_std": 0.1968342363834381,
"rewards/progression_diversity/mean": -0.004279204178601503,
"rewards/progression_diversity/std": 0.03591744601726532,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.9202473759651184,
"rewards/symbolic_reward_partial_score/std": 0.24266402423381805,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0211315155029297,
"sampling/importance_sampling_ratio/min": 5.997556399961468e-06,
"sampling/sampling_logp_difference/max": 12.024158477783203,
"sampling/sampling_logp_difference/mean": 0.07138025015592575,
"step": 617
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.13448263704776764,
"epoch": 1.6263157894736842,
"grad_norm": 0.0195352453738451,
"learning_rate": 1e-06,
"loss": 0.0212,
"step": 618
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.12872284650802612,
"epoch": 1.6289473684210525,
"grad_norm": 0.035076647996902466,
"learning_rate": 1e-06,
"loss": 0.0472,
"step": 619
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.12851262837648392,
"epoch": 1.631578947368421,
"grad_norm": 0.017521562054753304,
"learning_rate": 1e-06,
"loss": 0.0411,
"step": 620
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5643.0,
"completions/mean_length": 1071.30078125,
"completions/mean_terminated_length": 766.2669677734375,
"completions/min_length": 282.0,
"completions/min_terminated_length": 282.0,
"entropy": 0.12900108844041824,
"epoch": 1.6342105263157896,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.03199876844882965,
"learning_rate": 1e-06,
"loss": 0.0156,
"num_tokens": 194057987.0,
"reward": 0.7672144174575806,
"reward_std": 0.19012746214866638,
"rewards/progression_diversity/mean": -0.002193653956055641,
"rewards/progression_diversity/std": 0.022286431863904,
"rewards/symbolic_reward_accuracy/mean": 0.830078125,
"rewards/symbolic_reward_accuracy/std": 0.3759314715862274,
"rewards/symbolic_reward_partial_score/mean": 0.9031575918197632,
"rewards/symbolic_reward_partial_score/std": 0.2527180016040802,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0220717191696167,
"sampling/importance_sampling_ratio/min": 3.4926625480657947e-10,
"sampling/sampling_logp_difference/max": 21.77518653869629,
"sampling/sampling_logp_difference/mean": 0.0735945776104927,
"step": 621
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.13501236587762833,
"epoch": 1.6368421052631579,
"grad_norm": 0.023030543699860573,
"learning_rate": 1e-06,
"loss": -0.0017,
"step": 622
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.13038154691457748,
"epoch": 1.6394736842105262,
"grad_norm": 0.020995106548070908,
"learning_rate": 1e-06,
"loss": 0.0482,
"step": 623
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.13032087683677673,
"epoch": 1.6421052631578947,
"grad_norm": 0.01794985868036747,
"learning_rate": 1e-06,
"loss": 0.0357,
"step": 624
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4610.0,
"completions/mean_length": 1164.958984375,
"completions/mean_terminated_length": 578.4239501953125,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.13446198403835297,
"epoch": 1.6447368421052633,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.02450123056769371,
"learning_rate": 1e-06,
"loss": 0.0266,
"num_tokens": 195026894.0,
"reward": 0.8016812801361084,
"reward_std": 0.14666664600372314,
"rewards/progression_diversity/mean": -0.007652563974261284,
"rewards/progression_diversity/std": 0.04430336132645607,
"rewards/symbolic_reward_accuracy/mean": 0.880859375,
"rewards/symbolic_reward_accuracy/std": 0.32427072525024414,
"rewards/symbolic_reward_partial_score/mean": 0.9231771230697632,
"rewards/symbolic_reward_partial_score/std": 0.2403518557548523,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.02314293384552,
"sampling/importance_sampling_ratio/min": 3.1002757168607786e-05,
"sampling/sampling_logp_difference/max": 10.381434440612793,
"sampling/sampling_logp_difference/mean": 0.07610031962394714,
"step": 625
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.13796798884868622,
"epoch": 1.6473684210526316,
"grad_norm": 0.03212736174464226,
"learning_rate": 1e-06,
"loss": 0.0522,
"step": 626
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.13732363283634186,
"epoch": 1.65,
"grad_norm": 0.03413732722401619,
"learning_rate": 1e-06,
"loss": 0.0207,
"step": 627
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.1372433453798294,
"epoch": 1.6526315789473685,
"grad_norm": 0.029417581856250763,
"learning_rate": 1e-06,
"loss": 0.0398,
"step": 628
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5332.0,
"completions/mean_length": 1251.904296875,
"completions/mean_terminated_length": 636.7784423828125,
"completions/min_length": 261.0,
"completions/min_terminated_length": 261.0,
"entropy": 0.13081193715333939,
"epoch": 1.655263157894737,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.03926420584321022,
"learning_rate": 1e-06,
"loss": 0.086,
"num_tokens": 196044989.0,
"reward": 0.7881196737289429,
"reward_std": 0.17913874983787537,
"rewards/progression_diversity/mean": -0.00639567244797945,
"rewards/progression_diversity/std": 0.04198707640171051,
"rewards/symbolic_reward_accuracy/mean": 0.865234375,
"rewards/symbolic_reward_accuracy/std": 0.3418070077896118,
"rewards/symbolic_reward_partial_score/mean": 0.9085286855697632,
"rewards/symbolic_reward_partial_score/std": 0.2608047127723694,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.023348331451416,
"sampling/importance_sampling_ratio/min": 1.0149918125534896e-05,
"sampling/sampling_logp_difference/max": 11.498044967651367,
"sampling/sampling_logp_difference/mean": 0.07550258934497833,
"step": 629
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.138485848903656,
"epoch": 1.6578947368421053,
"grad_norm": 0.01309316884726286,
"learning_rate": 1e-06,
"loss": 0.0257,
"step": 630
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.13896264880895615,
"epoch": 1.6605263157894736,
"grad_norm": 0.012009157799184322,
"learning_rate": 1e-06,
"loss": 0.0538,
"step": 631
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.0390625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.109375,
"entropy": 0.1417469009757042,
"epoch": 1.663157894736842,
"grad_norm": 0.024768110364675522,
"learning_rate": 1e-06,
"loss": 0.0391,
"step": 632
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6066.0,
"completions/mean_length": 731.333984375,
"completions/mean_terminated_length": 639.07861328125,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.14609570801258087,
"epoch": 1.6657894736842105,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.017739422619342804,
"learning_rate": 1e-06,
"loss": 0.0108,
"num_tokens": 196802888.0,
"reward": 0.8241581916809082,
"reward_std": 0.1724630892276764,
"rewards/progression_diversity/mean": -0.0011804921086877584,
"rewards/progression_diversity/std": 0.019015712663531303,
"rewards/symbolic_reward_accuracy/mean": 0.908203125,
"rewards/symbolic_reward_accuracy/std": 0.289021372795105,
"rewards/symbolic_reward_partial_score/mean": 0.9327799677848816,
"rewards/symbolic_reward_partial_score/std": 0.2337876856327057,
"rewards/tag_count_reward/mean": -0.005859375,
"rewards/tag_count_reward/std": 0.07639661431312561,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0246440172195435,
"sampling/importance_sampling_ratio/min": 1.7091028894355986e-06,
"sampling/sampling_logp_difference/max": 13.279541969299316,
"sampling/sampling_logp_difference/mean": 0.08123795688152313,
"step": 633
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.14178509265184402,
"epoch": 1.668421052631579,
"grad_norm": 0.027597403153777122,
"learning_rate": 1e-06,
"loss": 0.0282,
"step": 634
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.14551638811826706,
"epoch": 1.6710526315789473,
"grad_norm": 0.006523482967168093,
"learning_rate": 1e-06,
"loss": 0.0247,
"step": 635
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.1471918523311615,
"epoch": 1.6736842105263157,
"grad_norm": 0.019637571647763252,
"learning_rate": 1e-06,
"loss": 0.0191,
"step": 636
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2661.0,
"completions/mean_length": 711.6328125,
"completions/mean_terminated_length": 588.2283325195312,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.15460152179002762,
"epoch": 1.6763157894736842,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.016574524343013763,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 197560204.0,
"reward": 0.855990469455719,
"reward_std": 0.12720367312431335,
"rewards/progression_diversity/mean": -0.0015454285312443972,
"rewards/progression_diversity/std": 0.024757709354162216,
"rewards/symbolic_reward_accuracy/mean": 0.943359375,
"rewards/symbolic_reward_accuracy/std": 0.23138070106506348,
"rewards/symbolic_reward_partial_score/mean": 0.9685872793197632,
"rewards/symbolic_reward_partial_score/std": 0.14984627068042755,
"rewards/tag_count_reward/mean": -0.005859375,
"rewards/tag_count_reward/std": 0.07639661431312561,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0268505811691284,
"sampling/importance_sampling_ratio/min": 1.079319758900965e-06,
"sampling/sampling_logp_difference/max": 13.739179611206055,
"sampling/sampling_logp_difference/mean": 0.08289426565170288,
"step": 637
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.15232256054878235,
"epoch": 1.6789473684210527,
"grad_norm": 0.00705451937392354,
"learning_rate": 1e-06,
"loss": -0.0044,
"step": 638
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.1456858143210411,
"epoch": 1.681578947368421,
"grad_norm": 0.025568673387169838,
"learning_rate": 1e-06,
"loss": 0.0668,
"step": 639
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.1482095867395401,
"epoch": 1.6842105263157894,
"grad_norm": 0.02351662516593933,
"learning_rate": 1e-06,
"loss": 0.0198,
"step": 640
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5108.0,
"completions/mean_length": 814.15234375,
"completions/mean_terminated_length": 629.5296630859375,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"entropy": 0.14524078369140625,
"epoch": 1.686842105263158,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.02617044374346733,
"learning_rate": 1e-06,
"loss": 0.0027,
"num_tokens": 198374842.0,
"reward": 0.8252705335617065,
"reward_std": 0.16813138127326965,
"rewards/progression_diversity/mean": -0.0022450610995292664,
"rewards/progression_diversity/std": 0.032739993184804916,
"rewards/symbolic_reward_accuracy/mean": 0.90234375,
"rewards/symbolic_reward_accuracy/std": 0.29713961482048035,
"rewards/symbolic_reward_partial_score/mean": 0.9488931894302368,
"rewards/symbolic_reward_partial_score/std": 0.18802446126937866,
"rewards/tag_count_reward/mean": -0.0078125,
"rewards/tag_count_reward/std": 0.08812850713729858,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0245275497436523,
"sampling/importance_sampling_ratio/min": 5.705417915891076e-09,
"sampling/sampling_logp_difference/max": 18.981849670410156,
"sampling/sampling_logp_difference/mean": 0.07951345294713974,
"step": 641
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.1453712061047554,
"epoch": 1.6894736842105265,
"grad_norm": 0.030677122995257378,
"learning_rate": 1e-06,
"loss": 0.0483,
"step": 642
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.14691882580518723,
"epoch": 1.6921052631578948,
"grad_norm": 0.009873943403363228,
"learning_rate": 1e-06,
"loss": -0.0022,
"step": 643
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.14735910296440125,
"epoch": 1.694736842105263,
"grad_norm": 0.017524579539895058,
"learning_rate": 1e-06,
"loss": 0.0235,
"step": 644
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.060546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4121.0,
"completions/mean_length": 1686.984375,
"completions/mean_terminated_length": 739.7754516601562,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"entropy": 0.14734282344579697,
"epoch": 1.6973684210526314,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.03577494993805885,
"learning_rate": 1e-06,
"loss": 0.0143,
"num_tokens": 199650066.0,
"reward": 0.7072658538818359,
"reward_std": 0.2004225254058838,
"rewards/progression_diversity/mean": -0.010720351710915565,
"rewards/progression_diversity/std": 0.056672148406505585,
"rewards/symbolic_reward_accuracy/mean": 0.763671875,
"rewards/symbolic_reward_accuracy/std": 0.42524150013923645,
"rewards/symbolic_reward_partial_score/mean": 0.85009765625,
"rewards/symbolic_reward_partial_score/std": 0.32125625014305115,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0221692323684692,
"sampling/importance_sampling_ratio/min": 1.0171705753236893e-06,
"sampling/sampling_logp_difference/max": 13.79848575592041,
"sampling/sampling_logp_difference/mean": 0.06588836014270782,
"step": 645
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.13180600851774216,
"epoch": 1.7,
"grad_norm": 0.016572780907154083,
"learning_rate": 1e-06,
"loss": 0.0562,
"step": 646
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.13006508350372314,
"epoch": 1.7026315789473685,
"grad_norm": 0.024881578981876373,
"learning_rate": 1e-06,
"loss": 0.1149,
"step": 647
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.046875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.13194820284843445,
"epoch": 1.7052631578947368,
"grad_norm": 0.017025984823703766,
"learning_rate": 1e-06,
"loss": 0.0827,
"step": 648
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13227.0,
"completions/mean_length": 966.30859375,
"completions/mean_terminated_length": 721.5833740234375,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"entropy": 0.15003421157598495,
"epoch": 1.7078947368421051,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.026492591947317123,
"learning_rate": 1e-06,
"loss": 0.0272,
"num_tokens": 200547280.0,
"reward": 0.7986052632331848,
"reward_std": 0.18857163190841675,
"rewards/progression_diversity/mean": -0.0027552107349038124,
"rewards/progression_diversity/std": 0.026247508823871613,
"rewards/symbolic_reward_accuracy/mean": 0.875,
"rewards/symbolic_reward_accuracy/std": 0.3310423493385315,
"rewards/symbolic_reward_partial_score/mean": 0.9153645634651184,
"rewards/symbolic_reward_partial_score/std": 0.25720417499542236,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.025015115737915,
"sampling/importance_sampling_ratio/min": 3.4338822274548875e-07,
"sampling/sampling_logp_difference/max": 14.884404182434082,
"sampling/sampling_logp_difference/mean": 0.07899387180805206,
"step": 649
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.15113919973373413,
"epoch": 1.7105263157894737,
"grad_norm": 0.029967421665787697,
"learning_rate": 1e-06,
"loss": 0.04,
"step": 650
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.14088765531778336,
"epoch": 1.7131578947368422,
"grad_norm": 0.04313409700989723,
"learning_rate": 1e-06,
"loss": 0.0421,
"step": 651
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.14788048714399338,
"epoch": 1.7157894736842105,
"grad_norm": 0.014722016640007496,
"learning_rate": 1e-06,
"loss": 0.0357,
"step": 652
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4130.0,
"completions/mean_length": 835.263671875,
"completions/mean_terminated_length": 619.7366333007812,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"entropy": 0.14688391238451004,
"epoch": 1.7184210526315788,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.025073617696762085,
"learning_rate": 1e-06,
"loss": 0.0165,
"num_tokens": 201358551.0,
"reward": 0.7871145606040955,
"reward_std": 0.2015751600265503,
"rewards/progression_diversity/mean": -0.004367514047771692,
"rewards/progression_diversity/std": 0.043641433119773865,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.92333984375,
"rewards/symbolic_reward_partial_score/std": 0.23156411945819855,
"rewards/tag_count_reward/mean": -0.0078125,
"rewards/tag_count_reward/std": 0.08812850713729858,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0254812240600586,
"sampling/importance_sampling_ratio/min": 2.4464074429436655e-12,
"sampling/sampling_logp_difference/max": 26.736400604248047,
"sampling/sampling_logp_difference/mean": 0.08250489830970764,
"step": 653
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.14983681589365005,
"epoch": 1.7210526315789474,
"grad_norm": 0.03017502650618553,
"learning_rate": 1e-06,
"loss": 0.0162,
"step": 654
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.14353607594966888,
"epoch": 1.723684210526316,
"grad_norm": 0.011741744354367256,
"learning_rate": 1e-06,
"loss": 0.0456,
"step": 655
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.14645575731992722,
"epoch": 1.7263157894736842,
"grad_norm": 0.0143356928601861,
"learning_rate": 1e-06,
"loss": 0.0101,
"step": 656
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4858.0,
"completions/mean_length": 808.5078125,
"completions/mean_terminated_length": 623.8182373046875,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"entropy": 0.14062481373548508,
"epoch": 1.7289473684210526,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.03301079943776131,
"learning_rate": 1e-06,
"loss": 0.0232,
"num_tokens": 202161115.0,
"reward": 0.7795208692550659,
"reward_std": 0.21248871088027954,
"rewards/progression_diversity/mean": -0.0020201844163239002,
"rewards/progression_diversity/std": 0.022819824516773224,
"rewards/symbolic_reward_accuracy/mean": 0.853515625,
"rewards/symbolic_reward_accuracy/std": 0.35393697023391724,
"rewards/symbolic_reward_partial_score/mean": 0.8953450322151184,
"rewards/symbolic_reward_partial_score/std": 0.28262069821357727,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0237058401107788,
"sampling/importance_sampling_ratio/min": 4.2048032611319286e-08,
"sampling/sampling_logp_difference/max": 16.984453201293945,
"sampling/sampling_logp_difference/mean": 0.07847477495670319,
"step": 657
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.14044133573770523,
"epoch": 1.731578947368421,
"grad_norm": 0.013989591039717197,
"learning_rate": 1e-06,
"loss": 0.0237,
"step": 658
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.1442088633775711,
"epoch": 1.7342105263157894,
"grad_norm": 0.014133965596556664,
"learning_rate": 1e-06,
"loss": 0.0343,
"step": 659
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.14218034595251083,
"epoch": 1.736842105263158,
"grad_norm": 0.01604565605521202,
"learning_rate": 1e-06,
"loss": 0.0263,
"step": 660
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7880.0,
"completions/mean_length": 708.55078125,
"completions/mean_terminated_length": 616.1611328125,
"completions/min_length": 261.0,
"completions/min_terminated_length": 261.0,
"entropy": 0.14530527591705322,
"epoch": 1.7394736842105263,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.022363541647791862,
"learning_rate": 1e-06,
"loss": 0.0011,
"num_tokens": 202912181.0,
"reward": 0.8098558187484741,
"reward_std": 0.1898079812526703,
"rewards/progression_diversity/mean": -0.0007470193086192012,
"rewards/progression_diversity/std": 0.011871008202433586,
"rewards/symbolic_reward_accuracy/mean": 0.888671875,
"rewards/symbolic_reward_accuracy/std": 0.31484565138816833,
"rewards/symbolic_reward_partial_score/mean": 0.9241536855697632,
"rewards/symbolic_reward_partial_score/std": 0.24054944515228271,
"rewards/tag_count_reward/mean": -0.005859375,
"rewards/tag_count_reward/std": 0.07639661431312561,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0253888368606567,
"sampling/importance_sampling_ratio/min": 3.482146238020789e-10,
"sampling/sampling_logp_difference/max": 21.778202056884766,
"sampling/sampling_logp_difference/mean": 0.08183024823665619,
"step": 661
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.1518562138080597,
"epoch": 1.7421052631578946,
"grad_norm": 0.020019719377160072,
"learning_rate": 1e-06,
"loss": 0.0076,
"step": 662
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.14416959136724472,
"epoch": 1.7447368421052631,
"grad_norm": 0.04639910161495209,
"learning_rate": 1e-06,
"loss": 0.0314,
"step": 663
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.1428115963935852,
"epoch": 1.7473684210526317,
"grad_norm": 0.011411590501666069,
"learning_rate": 1e-06,
"loss": 0.0255,
"step": 664
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8516.0,
"completions/mean_length": 1431.109375,
"completions/mean_terminated_length": 727.8036499023438,
"completions/min_length": 252.0,
"completions/min_terminated_length": 252.0,
"entropy": 0.13438043743371964,
"epoch": 1.75,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.026513205841183662,
"learning_rate": 1e-06,
"loss": 0.0408,
"num_tokens": 204042541.0,
"reward": 0.6865507364273071,
"reward_std": 0.19577746093273163,
"rewards/progression_diversity/mean": -0.007034477777779102,
"rewards/progression_diversity/std": 0.0463702417910099,
"rewards/symbolic_reward_accuracy/mean": 0.732421875,
"rewards/symbolic_reward_accuracy/std": 0.4431293308734894,
"rewards/symbolic_reward_partial_score/mean": 0.8362630605697632,
"rewards/symbolic_reward_partial_score/std": 0.3250221014022827,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.02474045753479,
"sampling/importance_sampling_ratio/min": 1.1017795123480725e-17,
"sampling/sampling_logp_difference/max": 39.047019958496094,
"sampling/sampling_logp_difference/mean": 0.07735016196966171,
"step": 665
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.13546551764011383,
"epoch": 1.7526315789473683,
"grad_norm": 0.0192271675914526,
"learning_rate": 1e-06,
"loss": 0.0348,
"step": 666
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.13766096532344818,
"epoch": 1.7552631578947369,
"grad_norm": 0.017237944528460503,
"learning_rate": 1e-06,
"loss": 0.0368,
"step": 667
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.13993556797504425,
"epoch": 1.7578947368421054,
"grad_norm": 0.02463732473552227,
"learning_rate": 1e-06,
"loss": 0.0289,
"step": 668
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8065.0,
"completions/mean_length": 977.833984375,
"completions/mean_terminated_length": 670.9382934570312,
"completions/min_length": 252.0,
"completions/min_terminated_length": 252.0,
"entropy": 0.14678215980529785,
"epoch": 1.7605263157894737,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.023546183481812477,
"learning_rate": 1e-06,
"loss": -0.0008,
"num_tokens": 204941848.0,
"reward": 0.7797601222991943,
"reward_std": 0.18229469656944275,
"rewards/progression_diversity/mean": -0.002507382072508335,
"rewards/progression_diversity/std": 0.025364823639392853,
"rewards/symbolic_reward_accuracy/mean": 0.84765625,
"rewards/symbolic_reward_accuracy/std": 0.35970520973205566,
"rewards/symbolic_reward_partial_score/mean": 0.9091796875,
"rewards/symbolic_reward_partial_score/std": 0.2537998855113983,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0251445770263672,
"sampling/importance_sampling_ratio/min": 1.523035741968215e-08,
"sampling/sampling_logp_difference/max": 17.999975204467773,
"sampling/sampling_logp_difference/mean": 0.0789584219455719,
"step": 669
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.1442192792892456,
"epoch": 1.763157894736842,
"grad_norm": 0.020266570150852203,
"learning_rate": 1e-06,
"loss": 0.0654,
"step": 670
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.14410430938005447,
"epoch": 1.7657894736842106,
"grad_norm": 0.00926806777715683,
"learning_rate": 1e-06,
"loss": 0.0179,
"step": 671
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.14117338508367538,
"epoch": 1.768421052631579,
"grad_norm": 0.008366767317056656,
"learning_rate": 1e-06,
"loss": 0.0513,
"step": 672
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3605.0,
"completions/mean_length": 996.298828125,
"completions/mean_terminated_length": 658.4451293945312,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.14415088295936584,
"epoch": 1.7710526315789474,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.023884868249297142,
"learning_rate": 1e-06,
"loss": 0.0597,
"num_tokens": 205849745.0,
"reward": 0.8204151391983032,
"reward_std": 0.13469088077545166,
"rewards/progression_diversity/mean": -0.0043905326165258884,
"rewards/progression_diversity/std": 0.046648427844047546,
"rewards/symbolic_reward_accuracy/mean": 0.8984375,
"rewards/symbolic_reward_accuracy/std": 0.30236753821372986,
"rewards/symbolic_reward_partial_score/mean": 0.9431966543197632,
"rewards/symbolic_reward_partial_score/std": 0.2042941004037857,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0248339176177979,
"sampling/importance_sampling_ratio/min": 3.4787309033390557e-08,
"sampling/sampling_logp_difference/max": 17.174013137817383,
"sampling/sampling_logp_difference/mean": 0.07455458492040634,
"step": 673
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.13898008316755295,
"epoch": 1.7736842105263158,
"grad_norm": 0.029918354004621506,
"learning_rate": 1e-06,
"loss": 0.0328,
"step": 674
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.14073319733142853,
"epoch": 1.776315789473684,
"grad_norm": 0.01222238689661026,
"learning_rate": 1e-06,
"loss": 0.0455,
"step": 675
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.13992726802825928,
"epoch": 1.7789473684210526,
"grad_norm": 0.00996064767241478,
"learning_rate": 1e-06,
"loss": 0.046,
"step": 676
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 6681.0,
"completions/mean_length": 1095.326171875,
"completions/mean_terminated_length": 728.3980102539062,
"completions/min_length": 263.0,
"completions/min_terminated_length": 263.0,
"entropy": 0.14075642824172974,
"epoch": 1.7815789473684212,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.03422519192099571,
"learning_rate": 1e-06,
"loss": 0.0121,
"num_tokens": 206820696.0,
"reward": 0.7328424453735352,
"reward_std": 0.20037183165550232,
"rewards/progression_diversity/mean": -0.0018944772891700268,
"rewards/progression_diversity/std": 0.018759803846478462,
"rewards/symbolic_reward_accuracy/mean": 0.783203125,
"rewards/symbolic_reward_accuracy/std": 0.4124660789966583,
"rewards/symbolic_reward_partial_score/mean": 0.8836262822151184,
"rewards/symbolic_reward_partial_score/std": 0.27410218119621277,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0241965055465698,
"sampling/importance_sampling_ratio/min": 8.016753927364562e-11,
"sampling/sampling_logp_difference/max": 23.246902465820312,
"sampling/sampling_logp_difference/mean": 0.07802345603704453,
"step": 677
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.1405147835612297,
"epoch": 1.7842105263157895,
"grad_norm": 0.01229146309196949,
"learning_rate": 1e-06,
"loss": 0.0232,
"step": 678
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.13932440429925919,
"epoch": 1.7868421052631578,
"grad_norm": 0.01690313220024109,
"learning_rate": 1e-06,
"loss": 0.0334,
"step": 679
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.14134883880615234,
"epoch": 1.7894736842105263,
"grad_norm": 0.03306613117456436,
"learning_rate": 1e-06,
"loss": 0.0357,
"step": 680
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14676.0,
"completions/mean_length": 1426.642578125,
"completions/mean_terminated_length": 755.0877075195312,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"entropy": 0.13857001811265945,
"epoch": 1.7921052631578949,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.03711333125829697,
"learning_rate": 1e-06,
"loss": 0.0339,
"num_tokens": 207951297.0,
"reward": 0.7199710607528687,
"reward_std": 0.15880054235458374,
"rewards/progression_diversity/mean": -0.00485343299806118,
"rewards/progression_diversity/std": 0.03532535955309868,
"rewards/symbolic_reward_accuracy/mean": 0.775390625,
"rewards/symbolic_reward_accuracy/std": 0.41773295402526855,
"rewards/symbolic_reward_partial_score/mean": 0.8616536855697632,
"rewards/symbolic_reward_partial_score/std": 0.30703648924827576,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0234718322753906,
"sampling/importance_sampling_ratio/min": 3.384801416927985e-08,
"sampling/sampling_logp_difference/max": 17.201385498046875,
"sampling/sampling_logp_difference/mean": 0.07443863153457642,
"step": 681
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.13849762827157974,
"epoch": 1.7947368421052632,
"grad_norm": 0.017198480665683746,
"learning_rate": 1e-06,
"loss": 0.0603,
"step": 682
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.13620536029338837,
"epoch": 1.7973684210526315,
"grad_norm": 0.024120958521962166,
"learning_rate": 1e-06,
"loss": 0.0262,
"step": 683
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.13732768595218658,
"epoch": 1.8,
"grad_norm": 0.031364087015390396,
"learning_rate": 1e-06,
"loss": 0.0579,
"step": 684
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5863.0,
"completions/mean_length": 877.501953125,
"completions/mean_terminated_length": 662.5604248046875,
"completions/min_length": 266.0,
"completions/min_terminated_length": 266.0,
"entropy": 0.147243432700634,
"epoch": 1.8026315789473686,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.018228838220238686,
"learning_rate": 1e-06,
"loss": 0.0146,
"num_tokens": 208811746.0,
"reward": 0.7966721653938293,
"reward_std": 0.1374010145664215,
"rewards/progression_diversity/mean": -0.0007563849212601781,
"rewards/progression_diversity/std": 0.012015795335173607,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.9166666269302368,
"rewards/symbolic_reward_partial_score/std": 0.2510036528110504,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0271267890930176,
"sampling/importance_sampling_ratio/min": 4.275814262655331e-07,
"sampling/sampling_logp_difference/max": 14.665121078491211,
"sampling/sampling_logp_difference/mean": 0.08050627261400223,
"step": 685
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.14790859073400497,
"epoch": 1.805263157894737,
"grad_norm": 0.019763531163334846,
"learning_rate": 1e-06,
"loss": 0.0097,
"step": 686
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.15091699361801147,
"epoch": 1.8078947368421052,
"grad_norm": 0.018503857776522636,
"learning_rate": 1e-06,
"loss": 0.0235,
"step": 687
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.15277040749788284,
"epoch": 1.8105263157894735,
"grad_norm": 0.014150720089673996,
"learning_rate": 1e-06,
"loss": 0.0199,
"step": 688
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9039.0,
"completions/mean_length": 803.11328125,
"completions/mean_terminated_length": 649.4556274414062,
"completions/min_length": 254.0,
"completions/min_terminated_length": 254.0,
"entropy": 0.16078810393810272,
"epoch": 1.813157894736842,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.019453125074505806,
"learning_rate": 1e-06,
"loss": -0.0026,
"num_tokens": 209626588.0,
"reward": 0.838794469833374,
"reward_std": 0.13540925085544586,
"rewards/progression_diversity/mean": -0.0023899937514215708,
"rewards/progression_diversity/std": 0.026318540796637535,
"rewards/symbolic_reward_accuracy/mean": 0.919921875,
"rewards/symbolic_reward_accuracy/std": 0.271679550409317,
"rewards/symbolic_reward_partial_score/mean": 0.9581705927848816,
"rewards/symbolic_reward_partial_score/std": 0.17204301059246063,
"rewards/tag_count_reward/mean": -0.005859375,
"rewards/tag_count_reward/std": 0.07639661431312561,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0289819240570068,
"sampling/importance_sampling_ratio/min": 5.673148734786082e-06,
"sampling/sampling_logp_difference/max": 12.079766273498535,
"sampling/sampling_logp_difference/mean": 0.08512907475233078,
"step": 689
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.15033067017793655,
"epoch": 1.8157894736842106,
"grad_norm": 0.053703587502241135,
"learning_rate": 1e-06,
"loss": 0.028,
"step": 690
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.16108743101358414,
"epoch": 1.818421052631579,
"grad_norm": 0.012639776803553104,
"learning_rate": 1e-06,
"loss": 0.0456,
"step": 691
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0546875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1015625,
"entropy": 0.1631702333688736,
"epoch": 1.8210526315789473,
"grad_norm": 0.009674533270299435,
"learning_rate": 1e-06,
"loss": 0.0075,
"step": 692
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13431.0,
"completions/mean_length": 1337.83984375,
"completions/mean_terminated_length": 726.207275390625,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.15106821060180664,
"epoch": 1.8236842105263158,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.027334807440638542,
"learning_rate": 1e-06,
"loss": 0.0295,
"num_tokens": 210713706.0,
"reward": 0.7495685815811157,
"reward_std": 0.16618216037750244,
"rewards/progression_diversity/mean": -0.0040826634503901005,
"rewards/progression_diversity/std": 0.031112631782889366,
"rewards/symbolic_reward_accuracy/mean": 0.8203125,
"rewards/symbolic_reward_accuracy/std": 0.38430243730545044,
"rewards/symbolic_reward_partial_score/mean": 0.8671875,
"rewards/symbolic_reward_partial_score/std": 0.31214237213134766,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0262157917022705,
"sampling/importance_sampling_ratio/min": 1.5291009569651237e-09,
"sampling/sampling_logp_difference/max": 20.298585891723633,
"sampling/sampling_logp_difference/mean": 0.07850104570388794,
"step": 693
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.15497365593910217,
"epoch": 1.8263157894736843,
"grad_norm": 0.018270188942551613,
"learning_rate": 1e-06,
"loss": 0.0111,
"step": 694
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0546875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.1553620547056198,
"epoch": 1.8289473684210527,
"grad_norm": 0.009259329177439213,
"learning_rate": 1e-06,
"loss": 0.0377,
"step": 695
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0625,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.14360713958740234,
"epoch": 1.831578947368421,
"grad_norm": 0.010323197580873966,
"learning_rate": 1e-06,
"loss": 0.0499,
"step": 696
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11742.0,
"completions/mean_length": 1135.07421875,
"completions/mean_terminated_length": 674.8450317382812,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"entropy": 0.1573568657040596,
"epoch": 1.8342105263157895,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.03801272064447403,
"learning_rate": 1e-06,
"loss": 0.0393,
"num_tokens": 211690672.0,
"reward": 0.7663776874542236,
"reward_std": 0.2096462845802307,
"rewards/progression_diversity/mean": -0.007742568850517273,
"rewards/progression_diversity/std": 0.055101774632930756,
"rewards/symbolic_reward_accuracy/mean": 0.8359375,
"rewards/symbolic_reward_accuracy/std": 0.37069445848464966,
"rewards/symbolic_reward_partial_score/mean": 0.88818359375,
"rewards/symbolic_reward_partial_score/std": 0.2851507365703583,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0286474227905273,
"sampling/importance_sampling_ratio/min": 1.8554568725903664e-07,
"sampling/sampling_logp_difference/max": 15.499964714050293,
"sampling/sampling_logp_difference/mean": 0.08280766010284424,
"step": 697
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.15686368942260742,
"epoch": 1.836842105263158,
"grad_norm": 0.022794852033257484,
"learning_rate": 1e-06,
"loss": 0.0325,
"step": 698
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.1601133495569229,
"epoch": 1.8394736842105264,
"grad_norm": 0.009969050996005535,
"learning_rate": 1e-06,
"loss": 0.0541,
"step": 699
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.16635262221097946,
"epoch": 1.8421052631578947,
"grad_norm": 0.016408884897828102,
"learning_rate": 1e-06,
"loss": 0.0137,
"step": 700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7094.0,
"completions/mean_length": 898.439453125,
"completions/mean_terminated_length": 621.36181640625,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"entropy": 0.16016201674938202,
"epoch": 1.844736842105263,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.06322816014289856,
"learning_rate": 1e-06,
"loss": 0.0253,
"num_tokens": 212557809.0,
"reward": 0.7900543212890625,
"reward_std": 0.2123447060585022,
"rewards/progression_diversity/mean": -0.003359610214829445,
"rewards/progression_diversity/std": 0.036421939730644226,
"rewards/symbolic_reward_accuracy/mean": 0.865234375,
"rewards/symbolic_reward_accuracy/std": 0.3418070077896118,
"rewards/symbolic_reward_partial_score/mean": 0.9064127206802368,
"rewards/symbolic_reward_partial_score/std": 0.2696720063686371,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0269652605056763,
"sampling/importance_sampling_ratio/min": 2.2314340597517912e-08,
"sampling/sampling_logp_difference/max": 17.6180362701416,
"sampling/sampling_logp_difference/mean": 0.0807926282286644,
"step": 701
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.15802810341119766,
"epoch": 1.8473684210526315,
"grad_norm": 0.015205918811261654,
"learning_rate": 1e-06,
"loss": 0.0352,
"step": 702
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.16626133769750595,
"epoch": 1.85,
"grad_norm": 0.01964223012328148,
"learning_rate": 1e-06,
"loss": 0.045,
"step": 703
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.1609407141804695,
"epoch": 1.8526315789473684,
"grad_norm": 0.01399518083781004,
"learning_rate": 1e-06,
"loss": 0.0803,
"step": 704
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11244.0,
"completions/mean_length": 1886.1796875,
"completions/mean_terminated_length": 657.5508422851562,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.15396300703287125,
"epoch": 1.8552631578947367,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.04543311893939972,
"learning_rate": 1e-06,
"loss": 0.0924,
"num_tokens": 213938029.0,
"reward": 0.7416077852249146,
"reward_std": 0.23779910802841187,
"rewards/progression_diversity/mean": -0.014027466997504234,
"rewards/progression_diversity/std": 0.06849034875631332,
"rewards/symbolic_reward_accuracy/mean": 0.80859375,
"rewards/symbolic_reward_accuracy/std": 0.3937928080558777,
"rewards/symbolic_reward_partial_score/mean": 0.86376953125,
"rewards/symbolic_reward_partial_score/std": 0.31700682640075684,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0203101634979248,
"sampling/importance_sampling_ratio/min": 1.0977701947467722e-07,
"sampling/sampling_logp_difference/max": 16.02481460571289,
"sampling/sampling_logp_difference/mean": 0.06179344281554222,
"step": 705
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.1514749750494957,
"epoch": 1.8578947368421053,
"grad_norm": 0.024540159851312637,
"learning_rate": 1e-06,
"loss": 0.0743,
"step": 706
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.15695882588624954,
"epoch": 1.8605263157894738,
"grad_norm": 0.01950952038168907,
"learning_rate": 1e-06,
"loss": 0.0336,
"step": 707
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.13617245852947235,
"epoch": 1.8631578947368421,
"grad_norm": 0.031919222325086594,
"learning_rate": 1e-06,
"loss": 0.1621,
"step": 708
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.146484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5933.0,
"completions/mean_length": 2955.310546875,
"completions/mean_terminated_length": 650.6155395507812,
"completions/min_length": 255.0,
"completions/min_terminated_length": 255.0,
"entropy": 0.1501893624663353,
"epoch": 1.8657894736842104,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.027009177953004837,
"learning_rate": 1e-06,
"loss": 0.1153,
"num_tokens": 215862796.0,
"reward": 0.6578521728515625,
"reward_std": 0.2280564308166504,
"rewards/progression_diversity/mean": -0.025331620126962662,
"rewards/progression_diversity/std": 0.08841913938522339,
"rewards/symbolic_reward_accuracy/mean": 0.7109375,
"rewards/symbolic_reward_accuracy/std": 0.45377036929130554,
"rewards/symbolic_reward_partial_score/mean": 0.8011067509651184,
"rewards/symbolic_reward_partial_score/std": 0.3672308921813965,
"rewards/tag_count_reward/mean": -0.087890625,
"rewards/tag_count_reward/std": 0.2834126651287079,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0195657014846802,
"sampling/importance_sampling_ratio/min": 1.3204642934638855e-10,
"sampling/sampling_logp_difference/max": 22.747867584228516,
"sampling/sampling_logp_difference/mean": 0.05873488262295723,
"step": 709
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.14868657290935516,
"epoch": 1.868421052631579,
"grad_norm": 0.06350129097700119,
"learning_rate": 1e-06,
"loss": 0.1039,
"step": 710
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.21875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.1268022656440735,
"epoch": 1.8710526315789475,
"grad_norm": 0.012040435336530209,
"learning_rate": 1e-06,
"loss": 0.2382,
"step": 711
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.14944910258054733,
"epoch": 1.8736842105263158,
"grad_norm": 0.042405128479003906,
"learning_rate": 1e-06,
"loss": 0.0774,
"step": 712
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14478.0,
"completions/mean_length": 2155.14453125,
"completions/mean_terminated_length": 615.22509765625,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"entropy": 0.14173594117164612,
"epoch": 1.8763157894736842,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.10443775355815887,
"learning_rate": 1e-06,
"loss": 0.135,
"num_tokens": 217349846.0,
"reward": 0.722713828086853,
"reward_std": 0.21990327537059784,
"rewards/progression_diversity/mean": -0.028423257172107697,
"rewards/progression_diversity/std": 0.10595274716615677,
"rewards/symbolic_reward_accuracy/mean": 0.79296875,
"rewards/symbolic_reward_accuracy/std": 0.40557438135147095,
"rewards/symbolic_reward_partial_score/mean": 0.8435872197151184,
"rewards/symbolic_reward_partial_score/std": 0.33945873379707336,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0188918113708496,
"sampling/importance_sampling_ratio/min": 1.1887545392497145e-09,
"sampling/sampling_logp_difference/max": 20.55035972595215,
"sampling/sampling_logp_difference/mean": 0.059576526284217834,
"step": 713
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0546875,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.1623135283589363,
"epoch": 1.8789473684210525,
"grad_norm": 0.030199255794286728,
"learning_rate": 1e-06,
"loss": 0.1038,
"step": 714
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.1539328470826149,
"epoch": 1.881578947368421,
"grad_norm": 0.028396856039762497,
"learning_rate": 1e-06,
"loss": 0.0772,
"step": 715
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.03125,
"clip_ratio/low_mean": 0.2109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.14560812711715698,
"epoch": 1.8842105263157896,
"grad_norm": 0.03335552662611008,
"learning_rate": 1e-06,
"loss": 0.1204,
"step": 716
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.130859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13825.0,
"completions/mean_length": 2713.19140625,
"completions/mean_terminated_length": 654.889892578125,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"entropy": 0.1502734199166298,
"epoch": 1.8868421052631579,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.0449819378554821,
"learning_rate": 1e-06,
"loss": 0.1002,
"num_tokens": 219142648.0,
"reward": 0.6932658553123474,
"reward_std": 0.27170705795288086,
"rewards/progression_diversity/mean": -0.033768799155950546,
"rewards/progression_diversity/std": 0.11512268334627151,
"rewards/symbolic_reward_accuracy/mean": 0.763671875,
"rewards/symbolic_reward_accuracy/std": 0.42524150013923645,
"rewards/symbolic_reward_partial_score/mean": 0.8133138418197632,
"rewards/symbolic_reward_partial_score/std": 0.36641624569892883,
"rewards/tag_count_reward/mean": -0.0859375,
"rewards/tag_count_reward/std": 0.28054583072662354,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0132670402526855,
"sampling/importance_sampling_ratio/min": 2.8605501180398087e-15,
"sampling/sampling_logp_difference/max": 33.487762451171875,
"sampling/sampling_logp_difference/mean": 0.046033672988414764,
"step": 717
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.1434255838394165,
"epoch": 1.8894736842105262,
"grad_norm": 0.020671065896749496,
"learning_rate": 1e-06,
"loss": 0.1992,
"step": 718
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.5,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.671875,
"entropy": 0.12531127035617828,
"epoch": 1.8921052631578947,
"grad_norm": 0.022249706089496613,
"learning_rate": 1e-06,
"loss": 0.2442,
"step": 719
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4453125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5859375,
"entropy": 0.15144315361976624,
"epoch": 1.8947368421052633,
"grad_norm": 0.03603983670473099,
"learning_rate": 1e-06,
"loss": 0.1141,
"step": 720
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14424.0,
"completions/mean_length": 2154.701171875,
"completions/mean_terminated_length": 614.7337646484375,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"entropy": 0.1503245010972023,
"epoch": 1.8973684210526316,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.14981180429458618,
"learning_rate": 1e-06,
"loss": 0.2187,
"num_tokens": 220655679.0,
"reward": 0.7320119738578796,
"reward_std": 0.22837099432945251,
"rewards/progression_diversity/mean": -0.021462373435497284,
"rewards/progression_diversity/std": 0.08626196533441544,
"rewards/symbolic_reward_accuracy/mean": 0.802734375,
"rewards/symbolic_reward_accuracy/std": 0.3983237147331238,
"rewards/symbolic_reward_partial_score/mean": 0.8561197519302368,
"rewards/symbolic_reward_partial_score/std": 0.3233526647090912,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.019932508468628,
"sampling/importance_sampling_ratio/min": 4.195763745121206e-13,
"sampling/sampling_logp_difference/max": 28.499530792236328,
"sampling/sampling_logp_difference/mean": 0.06518127024173737,
"step": 721
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0234375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.17081984877586365,
"epoch": 1.9,
"grad_norm": 0.023720135912299156,
"learning_rate": 1e-06,
"loss": 0.0536,
"step": 722
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.16384224593639374,
"epoch": 1.9026315789473685,
"grad_norm": 0.034469299018383026,
"learning_rate": 1e-06,
"loss": 0.1537,
"step": 723
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.16798613965511322,
"epoch": 1.905263157894737,
"grad_norm": 0.03507945314049721,
"learning_rate": 1e-06,
"loss": 0.0875,
"step": 724
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.126953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16240.0,
"completions/mean_length": 2634.119140625,
"completions/mean_terminated_length": 634.6957397460938,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"entropy": 0.15430939197540283,
"epoch": 1.9078947368421053,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.09424492716789246,
"learning_rate": 1e-06,
"loss": 0.1251,
"num_tokens": 222400316.0,
"reward": 0.671546220779419,
"reward_std": 0.3024430274963379,
"rewards/progression_diversity/mean": -0.03776249662041664,
"rewards/progression_diversity/std": 0.12152393907308578,
"rewards/symbolic_reward_accuracy/mean": 0.732421875,
"rewards/symbolic_reward_accuracy/std": 0.4431293308734894,
"rewards/symbolic_reward_partial_score/mean": 0.79638671875,
"rewards/symbolic_reward_partial_score/std": 0.3773951828479767,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0167055130004883,
"sampling/importance_sampling_ratio/min": 6.6573236350974275e-15,
"sampling/sampling_logp_difference/max": 32.64305877685547,
"sampling/sampling_logp_difference/mean": 0.05589202791452408,
"step": 725
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.25,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.14185352623462677,
"epoch": 1.9105263157894736,
"grad_norm": 0.06517547369003296,
"learning_rate": 1e-06,
"loss": 0.2122,
"step": 726
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.15953659266233444,
"epoch": 1.913157894736842,
"grad_norm": 0.022468971088528633,
"learning_rate": 1e-06,
"loss": 0.1739,
"step": 727
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4921875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.640625,
"entropy": 0.1624743863940239,
"epoch": 1.9157894736842105,
"grad_norm": 0.009907384403049946,
"learning_rate": 1e-06,
"loss": 0.0998,
"step": 728
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.166015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15558.0,
"completions/mean_length": 3540.01953125,
"completions/mean_terminated_length": 983.2552490234375,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"entropy": 0.1347077265381813,
"epoch": 1.918421052631579,
"frac_reward_zero_std": 0.09375,
"grad_norm": 0.13737860321998596,
"learning_rate": 1e-06,
"loss": 0.2226,
"num_tokens": 224636646.0,
"reward": 0.5830076932907104,
"reward_std": 0.30949866771698,
"rewards/progression_diversity/mean": -0.048849739134311676,
"rewards/progression_diversity/std": 0.1297674924135208,
"rewards/symbolic_reward_accuracy/mean": 0.630859375,
"rewards/symbolic_reward_accuracy/std": 0.4830440282821655,
"rewards/symbolic_reward_partial_score/mean": 0.7255859375,
"rewards/symbolic_reward_partial_score/std": 0.4099434018135071,
"rewards/tag_count_reward/mean": -0.126953125,
"rewards/tag_count_reward/std": 0.33324605226516724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0149319171905518,
"sampling/importance_sampling_ratio/min": 4.369263012223404e-22,
"sampling/sampling_logp_difference/max": 49.18227767944336,
"sampling/sampling_logp_difference/mean": 0.04961749166250229,
"step": 729
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0390625,
"clip_ratio/low_mean": 0.3046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.15040332078933716,
"epoch": 1.9210526315789473,
"grad_norm": 0.059476714581251144,
"learning_rate": 1e-06,
"loss": 0.1366,
"step": 730
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.046875,
"clip_ratio/low_mean": 0.2890625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.15057525038719177,
"epoch": 1.9236842105263157,
"grad_norm": 0.06425424665212631,
"learning_rate": 1e-06,
"loss": 0.2423,
"step": 731
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.25,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.15043792128562927,
"epoch": 1.9263157894736842,
"grad_norm": 0.022221507504582405,
"learning_rate": 1e-06,
"loss": 0.2027,
"step": 732
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.087890625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14566.0,
"completions/mean_length": 2165.96875,
"completions/mean_terminated_length": 795.9229125976562,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"entropy": 0.17250816524028778,
"epoch": 1.9289473684210527,
"frac_reward_zero_std": 0.09375,
"grad_norm": 0.7282646298408508,
"learning_rate": 1e-06,
"loss": 0.1429,
"num_tokens": 226135254.0,
"reward": 0.6728378534317017,
"reward_std": 0.29266250133514404,
"rewards/progression_diversity/mean": -0.030671168118715286,
"rewards/progression_diversity/std": 0.11537665873765945,
"rewards/symbolic_reward_accuracy/mean": 0.724609375,
"rewards/symbolic_reward_accuracy/std": 0.44714778661727905,
"rewards/symbolic_reward_partial_score/mean": 0.8147786259651184,
"rewards/symbolic_reward_partial_score/std": 0.35132884979248047,
"rewards/tag_count_reward/mean": -0.060546875,
"rewards/tag_count_reward/std": 0.2387305200099945,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0237343311309814,
"sampling/importance_sampling_ratio/min": 3.73120688677813e-20,
"sampling/sampling_logp_difference/max": 44.73497009277344,
"sampling/sampling_logp_difference/mean": 0.07423095405101776,
"step": 733
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.18838734924793243,
"epoch": 1.931578947368421,
"grad_norm": 0.019116874784231186,
"learning_rate": 1e-06,
"loss": 0.1143,
"step": 734
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.17143803089857101,
"epoch": 1.9342105263157894,
"grad_norm": 0.1997663974761963,
"learning_rate": 1e-06,
"loss": 0.1573,
"step": 735
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.1800050213932991,
"epoch": 1.936842105263158,
"grad_norm": 0.02649916335940361,
"learning_rate": 1e-06,
"loss": 0.0814,
"step": 736
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.103515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15817.0,
"completions/mean_length": 2609.44921875,
"completions/mean_terminated_length": 1018.9237060546875,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"entropy": 0.1816265881061554,
"epoch": 1.9394736842105265,
"frac_reward_zero_std": 0.09375,
"grad_norm": 0.5376126170158386,
"learning_rate": 1e-06,
"loss": 0.1134,
"num_tokens": 227868924.0,
"reward": 0.6405581831932068,
"reward_std": 0.2928628921508789,
"rewards/progression_diversity/mean": -0.04574853554368019,
"rewards/progression_diversity/std": 0.13723419606685638,
"rewards/symbolic_reward_accuracy/mean": 0.69921875,
"rewards/symbolic_reward_accuracy/std": 0.45904624462127686,
"rewards/symbolic_reward_partial_score/mean": 0.7682291269302368,
"rewards/symbolic_reward_partial_score/std": 0.39795729517936707,
"rewards/tag_count_reward/mean": -0.08984375,
"rewards/tag_count_reward/std": 0.2862374484539032,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0221675634384155,
"sampling/importance_sampling_ratio/min": 5.437771195999072e-18,
"sampling/sampling_logp_difference/max": 39.7531623840332,
"sampling/sampling_logp_difference/mean": 0.06859034299850464,
"step": 737
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.2578125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.18071607500314713,
"epoch": 1.9421052631578948,
"grad_norm": 0.05956464633345604,
"learning_rate": 1e-06,
"loss": 0.1105,
"step": 738
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.2734375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.16678573191165924,
"epoch": 1.944736842105263,
"grad_norm": 0.16846124827861786,
"learning_rate": 1e-06,
"loss": 0.1718,
"step": 739
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.2578125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.17628037184476852,
"epoch": 1.9473684210526314,
"grad_norm": 0.04649341478943825,
"learning_rate": 1e-06,
"loss": 0.1492,
"step": 740
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16126.0,
"completions/mean_length": 2970.525390625,
"completions/mean_terminated_length": 1454.219482421875,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"entropy": 0.14562518894672394,
"epoch": 1.95,
"frac_reward_zero_std": 0.03125,
"grad_norm": 2.966646194458008,
"learning_rate": 1e-06,
"loss": 0.2426,
"num_tokens": 229805993.0,
"reward": 0.6170042157173157,
"reward_std": 0.3738676607608795,
"rewards/progression_diversity/mean": -0.07204228639602661,
"rewards/progression_diversity/std": 0.18077191710472107,
"rewards/symbolic_reward_accuracy/mean": 0.67578125,
"rewards/symbolic_reward_accuracy/std": 0.4685399830341339,
"rewards/symbolic_reward_partial_score/mean": 0.74072265625,
"rewards/symbolic_reward_partial_score/std": 0.41318923234939575,
"rewards/tag_count_reward/mean": -0.099609375,
"rewards/tag_count_reward/std": 0.29977133870124817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.013148307800293,
"sampling/importance_sampling_ratio/min": 2.707424684788481e-31,
"sampling/sampling_logp_difference/max": 70.38414001464844,
"sampling/sampling_logp_difference/mean": 0.05534841865301132,
"step": 741
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.203125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5625,
"entropy": 0.17404460906982422,
"epoch": 1.9526315789473685,
"grad_norm": 0.013007045723497868,
"learning_rate": 1e-06,
"loss": 0.2101,
"step": 742
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.390625,
"clip_ratio/low_mean": 0.2265625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.6171875,
"entropy": 0.16387271136045456,
"epoch": 1.9552631578947368,
"grad_norm": 0.044528309255838394,
"learning_rate": 1e-06,
"loss": 0.1907,
"step": 743
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.5078125,
"clip_ratio/low_mean": 0.2109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.71875,
"entropy": 0.18834584951400757,
"epoch": 1.9578947368421051,
"grad_norm": 0.012452390044927597,
"learning_rate": 1e-06,
"loss": 0.1549,
"step": 744
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.103515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16311.0,
"completions/mean_length": 3217.857421875,
"completions/mean_terminated_length": 1697.5838623046875,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"entropy": 0.13727252185344696,
"epoch": 1.9605263157894737,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4300470352172852,
"learning_rate": 1e-06,
"loss": 0.2051,
"num_tokens": 231876032.0,
"reward": 0.619334876537323,
"reward_std": 0.37329334020614624,
"rewards/progression_diversity/mean": -0.07823123037815094,
"rewards/progression_diversity/std": 0.17627538740634918,
"rewards/symbolic_reward_accuracy/mean": 0.681640625,
"rewards/symbolic_reward_accuracy/std": 0.46629536151885986,
"rewards/symbolic_reward_partial_score/mean": 0.7376302480697632,
"rewards/symbolic_reward_partial_score/std": 0.4179321825504303,
"rewards/tag_count_reward/mean": -0.1015625,
"rewards/tag_count_reward/std": 0.30236753821372986,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.012220859527588,
"sampling/importance_sampling_ratio/min": 5.182242701661106e-28,
"sampling/sampling_logp_difference/max": 62.827144622802734,
"sampling/sampling_logp_difference/mean": 0.056160710752010345,
"step": 745
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.3046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.1606038585305214,
"epoch": 1.9631578947368422,
"grad_norm": 0.028117861598730087,
"learning_rate": 1e-06,
"loss": 0.2423,
"step": 746
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.03125,
"clip_ratio/low_mean": 0.2578125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.1709175780415535,
"epoch": 1.9657894736842105,
"grad_norm": 0.67425537109375,
"learning_rate": 1e-06,
"loss": 0.2054,
"step": 747
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.3125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.15508843958377838,
"epoch": 1.9684210526315788,
"grad_norm": 0.06400001794099808,
"learning_rate": 1e-06,
"loss": 0.2487,
"step": 748
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15746.0,
"completions/mean_length": 2000.880859375,
"completions/mean_terminated_length": 1293.5142822265625,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"entropy": 0.21451418101787567,
"epoch": 1.9710526315789474,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.9699931144714355,
"learning_rate": 1e-06,
"loss": 0.0579,
"num_tokens": 233257411.0,
"reward": 0.651739239692688,
"reward_std": 0.3094671964645386,
"rewards/progression_diversity/mean": -0.0653361827135086,
"rewards/progression_diversity/std": 0.19041091203689575,
"rewards/symbolic_reward_accuracy/mean": 0.7109375,
"rewards/symbolic_reward_accuracy/std": 0.45377036929130554,
"rewards/symbolic_reward_partial_score/mean": 0.76904296875,
"rewards/symbolic_reward_partial_score/std": 0.39429956674575806,
"rewards/tag_count_reward/mean": -0.048828125,
"rewards/tag_count_reward/std": 0.2157193273305893,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.021796703338623,
"sampling/importance_sampling_ratio/min": 8.134976418108564e-29,
"sampling/sampling_logp_difference/max": 64.67879486083984,
"sampling/sampling_logp_difference/mean": 0.07527394592761993,
"step": 749
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.5078125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.640625,
"entropy": 0.17407967895269394,
"epoch": 1.973684210526316,
"grad_norm": 0.060622621327638626,
"learning_rate": 1e-06,
"loss": 0.1803,
"step": 750
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.5859375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.7265625,
"entropy": 0.19498290121555328,
"epoch": 1.9763157894736842,
"grad_norm": 0.015352616086602211,
"learning_rate": 1e-06,
"loss": 0.181,
"step": 751
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.6875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.78125,
"entropy": 0.24272602796554565,
"epoch": 1.9789473684210526,
"grad_norm": 0.01805986650288105,
"learning_rate": 1e-06,
"loss": 0.0971,
"step": 752
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14517.0,
"completions/mean_length": 2323.900390625,
"completions/mean_terminated_length": 1448.79052734375,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"entropy": 0.18802518397569656,
"epoch": 1.981578947368421,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.341567039489746,
"learning_rate": 1e-06,
"loss": 0.1657,
"num_tokens": 234866896.0,
"reward": 0.6480075120925903,
"reward_std": 0.29000890254974365,
"rewards/progression_diversity/mean": -0.08694960176944733,
"rewards/progression_diversity/std": 0.21665886044502258,
"rewards/symbolic_reward_accuracy/mean": 0.70703125,
"rewards/symbolic_reward_accuracy/std": 0.455569326877594,
"rewards/symbolic_reward_partial_score/mean": 0.7762044072151184,
"rewards/symbolic_reward_partial_score/std": 0.3934388756752014,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.02323579788208,
"sampling/importance_sampling_ratio/min": 1.605486811297672e-22,
"sampling/sampling_logp_difference/max": 50.18344497680664,
"sampling/sampling_logp_difference/mean": 0.08397974073886871,
"step": 753
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.21599113941192627,
"epoch": 1.9842105263157894,
"grad_norm": 0.01323725562542677,
"learning_rate": 1e-06,
"loss": 0.1011,
"step": 754
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.2421875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.22974644601345062,
"epoch": 1.986842105263158,
"grad_norm": 0.02560954913496971,
"learning_rate": 1e-06,
"loss": 0.1258,
"step": 755
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.2734375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.24319110810756683,
"epoch": 1.9894736842105263,
"grad_norm": 0.5963929891586304,
"learning_rate": 1e-06,
"loss": 0.1971,
"step": 756
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14817.0,
"completions/mean_length": 1997.1171875,
"completions/mean_terminated_length": 1164.818115234375,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"entropy": 0.22843757271766663,
"epoch": 1.9921052631578946,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.2587209939956665,
"learning_rate": 1e-06,
"loss": 0.0798,
"num_tokens": 236294732.0,
"reward": 0.694200873374939,
"reward_std": 0.2883387506008148,
"rewards/progression_diversity/mean": -0.07210341095924377,
"rewards/progression_diversity/std": 0.2020467221736908,
"rewards/symbolic_reward_accuracy/mean": 0.7578125,
"rewards/symbolic_reward_accuracy/std": 0.42882615327835083,
"rewards/symbolic_reward_partial_score/mean": 0.8216145634651184,
"rewards/symbolic_reward_partial_score/std": 0.3572600483894348,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0223069190979004,
"sampling/importance_sampling_ratio/min": 8.752450397611482e-24,
"sampling/sampling_logp_difference/max": 53.092708587646484,
"sampling/sampling_logp_difference/mean": 0.10721838474273682,
"step": 757
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.244800366461277,
"epoch": 1.9947368421052631,
"grad_norm": 0.015508892014622688,
"learning_rate": 1e-06,
"loss": 0.1874,
"step": 758
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5390625,
"entropy": 0.26591283082962036,
"epoch": 1.9973684210526317,
"grad_norm": 0.013314544223248959,
"learning_rate": 1e-06,
"loss": 0.1516,
"step": 759
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.2109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5546875,
"entropy": 0.2783430367708206,
"epoch": 2.0,
"grad_norm": 0.01680191606283188,
"learning_rate": 1e-06,
"loss": 0.1542,
"step": 760
},
{
"epoch": 2.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.020263671875,
"eval_completions/max_length": 15967.3125,
"eval_completions/max_terminated_length": 12734.125,
"eval_completions/mean_length": 1709.68359375,
"eval_completions/mean_terminated_length": 1407.859390258789,
"eval_completions/min_length": 175.84375,
"eval_completions/min_terminated_length": 175.84375,
"eval_entropy": 0.18563631316646934,
"eval_frac_reward_zero_std": 0.09375,
"eval_loss": 0.05905697122216225,
"eval_num_tokens": 236294732.0,
"eval_reward": 0.6300447043031454,
"eval_reward_std": 0.32271731412038207,
"eval_rewards/progression_diversity/mean": -0.08879435807466507,
"eval_rewards/progression_diversity/std": 0.2320320950821042,
"eval_rewards/symbolic_reward_accuracy/mean": 0.681396484375,
"eval_rewards/symbolic_reward_accuracy/std": 0.45585051737725735,
"eval_rewards/symbolic_reward_partial_score/mean": 0.7582194041460752,
"eval_rewards/symbolic_reward_partial_score/std": 0.38103948533535004,
"eval_rewards/tag_count_reward/mean": -0.0537109375,
"eval_rewards/tag_count_reward/std": 0.22261275839991868,
"eval_runtime": 3967.3544,
"eval_samples_per_second": 0.063,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.0347554087638855,
"eval_sampling/importance_sampling_ratio/min": 1.7375132229474076e-18,
"eval_sampling/sampling_logp_difference/max": 45.634475350379944,
"eval_sampling/sampling_logp_difference/mean": 0.12127477861940861,
"eval_steps_per_second": 0.001,
"step": 760
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16118.0,
"completions/mean_length": 1692.767578125,
"completions/mean_terminated_length": 1340.1781005859375,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"entropy": 0.22220094501972198,
"epoch": 2.0026315789473683,
"frac_reward_zero_std": 0.1875,
"grad_norm": 3.1163506507873535,
"learning_rate": 1e-06,
"loss": 0.0849,
"num_tokens": 237554389.0,
"reward": 0.6846272945404053,
"reward_std": 0.28297704458236694,
"rewards/progression_diversity/mean": -0.07242967188358307,
"rewards/progression_diversity/std": 0.20485153794288635,
"rewards/symbolic_reward_accuracy/mean": 0.744140625,
"rewards/symbolic_reward_accuracy/std": 0.43676990270614624,
"rewards/symbolic_reward_partial_score/mean": 0.8125,
"rewards/symbolic_reward_partial_score/std": 0.35843971371650696,
"rewards/tag_count_reward/mean": -0.048828125,
"rewards/tag_count_reward/std": 0.2157193273305893,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0238771438598633,
"sampling/importance_sampling_ratio/min": 3.568013560231561e-26,
"sampling/sampling_logp_difference/max": 58.5952033996582,
"sampling/sampling_logp_difference/mean": 0.12895509600639343,
"step": 761
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.21875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.2493431717157364,
"epoch": 2.0052631578947366,
"grad_norm": 0.010103636421263218,
"learning_rate": 1e-06,
"loss": 0.165,
"step": 762
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.3100139796733856,
"epoch": 2.0078947368421054,
"grad_norm": 0.04030955210328102,
"learning_rate": 1e-06,
"loss": 0.149,
"step": 763
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4296875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.53125,
"entropy": 0.3300608843564987,
"epoch": 2.0105263157894737,
"grad_norm": 0.010126029141247272,
"learning_rate": 1e-06,
"loss": 0.0765,
"step": 764
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14986.0,
"completions/mean_length": 1680.546875,
"completions/mean_terminated_length": 1447.1588134765625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"entropy": 0.21983007341623306,
"epoch": 2.013157894736842,
"frac_reward_zero_std": 0.15625,
"grad_norm": 3.134493589401245,
"learning_rate": 1e-06,
"loss": 0.1212,
"num_tokens": 238821645.0,
"reward": 0.6834567785263062,
"reward_std": 0.27827733755111694,
"rewards/progression_diversity/mean": -0.07717426866292953,
"rewards/progression_diversity/std": 0.20578458905220032,
"rewards/symbolic_reward_accuracy/mean": 0.7421875,
"rewards/symbolic_reward_accuracy/std": 0.43785804510116577,
"rewards/symbolic_reward_partial_score/mean": 0.81396484375,
"rewards/symbolic_reward_partial_score/std": 0.35770609974861145,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0259695053100586,
"sampling/importance_sampling_ratio/min": 3.113477761470245e-26,
"sampling/sampling_logp_difference/max": 58.73147201538086,
"sampling/sampling_logp_difference/mean": 0.13548921048641205,
"step": 765
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.390625,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5390625,
"entropy": 0.3355230987071991,
"epoch": 2.0157894736842104,
"grad_norm": 0.014872807078063488,
"learning_rate": 1e-06,
"loss": 0.0679,
"step": 766
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.5302972793579102,
"epoch": 2.018421052631579,
"grad_norm": 0.02062629908323288,
"learning_rate": 1e-06,
"loss": 0.0955,
"step": 767
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.718237966299057,
"epoch": 2.0210526315789474,
"grad_norm": 0.02212103269994259,
"learning_rate": 1e-06,
"loss": 0.1666,
"step": 768
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15111.0,
"completions/mean_length": 1686.498046875,
"completions/mean_terminated_length": 1242.911376953125,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"entropy": 0.23345400393009186,
"epoch": 2.0236842105263158,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.634035050868988,
"learning_rate": 1e-06,
"loss": 0.074,
"num_tokens": 240076428.0,
"reward": 0.6818051934242249,
"reward_std": 0.29768824577331543,
"rewards/progression_diversity/mean": -0.07144007831811905,
"rewards/progression_diversity/std": 0.19266043603420258,
"rewards/symbolic_reward_accuracy/mean": 0.740234375,
"rewards/symbolic_reward_accuracy/std": 0.4389347732067108,
"rewards/symbolic_reward_partial_score/mean": 0.8134765625,
"rewards/symbolic_reward_partial_score/std": 0.3538402020931244,
"rewards/tag_count_reward/mean": -0.056640625,
"rewards/tag_count_reward/std": 0.23138070106506348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0273222923278809,
"sampling/importance_sampling_ratio/min": 1.1702893084804623e-27,
"sampling/sampling_logp_difference/max": 62.01254653930664,
"sampling/sampling_logp_difference/mean": 0.14426520466804504,
"step": 769
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.2265625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.2532157674431801,
"epoch": 2.026315789473684,
"grad_norm": 0.07092837989330292,
"learning_rate": 1e-06,
"loss": 0.1038,
"step": 770
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.2421875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3199596703052521,
"epoch": 2.028947368421053,
"grad_norm": 0.031068623065948486,
"learning_rate": 1e-06,
"loss": 0.0876,
"step": 771
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.2421875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.3354818820953369,
"epoch": 2.031578947368421,
"grad_norm": 0.03109137900173664,
"learning_rate": 1e-06,
"loss": 0.1269,
"step": 772
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16351.0,
"completions/mean_length": 2114.2578125,
"completions/mean_terminated_length": 1624.1859130859375,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"entropy": 0.2218097448348999,
"epoch": 2.0342105263157895,
"frac_reward_zero_std": 0.0625,
"grad_norm": 11.015893936157227,
"learning_rate": 1e-06,
"loss": 0.1472,
"num_tokens": 241569232.0,
"reward": 0.6609280109405518,
"reward_std": 0.3157079815864563,
"rewards/progression_diversity/mean": -0.09372584521770477,
"rewards/progression_diversity/std": 0.21451076865196228,
"rewards/symbolic_reward_accuracy/mean": 0.716796875,
"rewards/symbolic_reward_accuracy/std": 0.4509948492050171,
"rewards/symbolic_reward_partial_score/mean": 0.7921549081802368,
"rewards/symbolic_reward_partial_score/std": 0.37911295890808105,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.011353611946106,
"sampling/importance_sampling_ratio/min": 4.881230540264573e-27,
"sampling/sampling_logp_difference/max": 60.58440017700195,
"sampling/sampling_logp_difference/mean": 0.21283593773841858,
"step": 773
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.484375,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.6640625,
"entropy": 0.29898831248283386,
"epoch": 2.036842105263158,
"grad_norm": 0.006906085181981325,
"learning_rate": 1e-06,
"loss": 0.1731,
"step": 774
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.5546875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.6953125,
"entropy": 0.378355011343956,
"epoch": 2.039473684210526,
"grad_norm": 0.26340046525001526,
"learning_rate": 1e-06,
"loss": 0.1015,
"step": 775
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.546875,
"clip_ratio/low_mean": 0.203125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.75,
"entropy": 0.5297541320323944,
"epoch": 2.042105263157895,
"grad_norm": 0.008265807293355465,
"learning_rate": 1e-06,
"loss": 0.1864,
"step": 776
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13564.0,
"completions/mean_length": 1399.826171875,
"completions/mean_terminated_length": 1101.336669921875,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"entropy": 0.23068619519472122,
"epoch": 2.044736842105263,
"frac_reward_zero_std": 0.1875,
"grad_norm": 4.750518321990967,
"learning_rate": 1e-06,
"loss": 0.1014,
"num_tokens": 242665047.0,
"reward": 0.7356716394424438,
"reward_std": 0.2675522565841675,
"rewards/progression_diversity/mean": -0.05099809169769287,
"rewards/progression_diversity/std": 0.1577788144350052,
"rewards/symbolic_reward_accuracy/mean": 0.806640625,
"rewards/symbolic_reward_accuracy/std": 0.39531853795051575,
"rewards/symbolic_reward_partial_score/mean": 0.8575845956802368,
"rewards/symbolic_reward_partial_score/std": 0.3261964023113251,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.020838737487793,
"sampling/importance_sampling_ratio/min": 2.984163448692399e-29,
"sampling/sampling_logp_difference/max": 65.68164825439453,
"sampling/sampling_logp_difference/mean": 0.23801082372665405,
"step": 777
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.27487921714782715,
"epoch": 2.0473684210526315,
"grad_norm": 0.010955928824841976,
"learning_rate": 1e-06,
"loss": 0.1502,
"step": 778
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.3157334327697754,
"epoch": 2.05,
"grad_norm": 0.04367856681346893,
"learning_rate": 1e-06,
"loss": 0.0794,
"step": 779
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.31336382031440735,
"epoch": 2.0526315789473686,
"grad_norm": 0.021015914157032967,
"learning_rate": 1e-06,
"loss": 0.0561,
"step": 780
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15221.0,
"completions/mean_length": 1509.474609375,
"completions/mean_terminated_length": 1152.486083984375,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"entropy": 0.22252874076366425,
"epoch": 2.055263157894737,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.6408276557922363,
"learning_rate": 1e-06,
"loss": 0.1134,
"num_tokens": 243863402.0,
"reward": 0.7193964123725891,
"reward_std": 0.28412267565727234,
"rewards/progression_diversity/mean": -0.037899717688560486,
"rewards/progression_diversity/std": 0.12527695298194885,
"rewards/symbolic_reward_accuracy/mean": 0.78515625,
"rewards/symbolic_reward_accuracy/std": 0.4111155867576599,
"rewards/symbolic_reward_partial_score/mean": 0.8458659052848816,
"rewards/symbolic_reward_partial_score/std": 0.32760530710220337,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0266220569610596,
"sampling/importance_sampling_ratio/min": 4.3695186577606616e-27,
"sampling/sampling_logp_difference/max": 60.69514465332031,
"sampling/sampling_logp_difference/mean": 0.12954550981521606,
"step": 781
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.22228249162435532,
"epoch": 2.057894736842105,
"grad_norm": 0.19859381020069122,
"learning_rate": 1e-06,
"loss": 0.131,
"step": 782
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.20911472290754318,
"epoch": 2.0605263157894735,
"grad_norm": 0.02292071282863617,
"learning_rate": 1e-06,
"loss": 0.1517,
"step": 783
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.2361936792731285,
"epoch": 2.0631578947368423,
"grad_norm": 0.015427610836923122,
"learning_rate": 1e-06,
"loss": 0.0483,
"step": 784
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15893.0,
"completions/mean_length": 1741.46875,
"completions/mean_terminated_length": 1238.593994140625,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"entropy": 0.21045714616775513,
"epoch": 2.0657894736842106,
"frac_reward_zero_std": 0.125,
"grad_norm": 4.345335006713867,
"learning_rate": 1e-06,
"loss": 0.1222,
"num_tokens": 245182202.0,
"reward": 0.7289832830429077,
"reward_std": 0.29325735569000244,
"rewards/progression_diversity/mean": -0.046007223427295685,
"rewards/progression_diversity/std": 0.139408677816391,
"rewards/symbolic_reward_accuracy/mean": 0.802734375,
"rewards/symbolic_reward_accuracy/std": 0.3983237147331238,
"rewards/symbolic_reward_partial_score/mean": 0.8435872793197632,
"rewards/symbolic_reward_partial_score/std": 0.34268614649772644,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0198302268981934,
"sampling/importance_sampling_ratio/min": 9.978149817821409e-34,
"sampling/sampling_logp_difference/max": 75.98749542236328,
"sampling/sampling_logp_difference/mean": 0.21147558093070984,
"step": 785
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.2109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2157856523990631,
"epoch": 2.068421052631579,
"grad_norm": 0.01274153497070074,
"learning_rate": 1e-06,
"loss": 0.2087,
"step": 786
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.2510286718606949,
"epoch": 2.0710526315789473,
"grad_norm": 0.06488237529993057,
"learning_rate": 1e-06,
"loss": 0.1405,
"step": 787
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.2752929925918579,
"epoch": 2.0736842105263156,
"grad_norm": 0.018524806946516037,
"learning_rate": 1e-06,
"loss": 0.1161,
"step": 788
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14714.0,
"completions/mean_length": 1852.96875,
"completions/mean_terminated_length": 1262.2763671875,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"entropy": 0.22035640478134155,
"epoch": 2.0763157894736843,
"frac_reward_zero_std": 0.1875,
"grad_norm": 3.2875747680664062,
"learning_rate": 1e-06,
"loss": 0.1068,
"num_tokens": 246517706.0,
"reward": 0.6830227971076965,
"reward_std": 0.23248738050460815,
"rewards/progression_diversity/mean": -0.04245023429393768,
"rewards/progression_diversity/std": 0.12012242525815964,
"rewards/symbolic_reward_accuracy/mean": 0.740234375,
"rewards/symbolic_reward_accuracy/std": 0.4389347732067108,
"rewards/symbolic_reward_partial_score/mean": 0.8165690302848816,
"rewards/symbolic_reward_partial_score/std": 0.35112887620925903,
"rewards/tag_count_reward/mean": -0.056640625,
"rewards/tag_count_reward/std": 0.23138070106506348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.023137092590332,
"sampling/importance_sampling_ratio/min": 1.018668067894736e-33,
"sampling/sampling_logp_difference/max": 75.96681213378906,
"sampling/sampling_logp_difference/mean": 0.3000330924987793,
"step": 789
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.453125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.578125,
"entropy": 0.23726185411214828,
"epoch": 2.0789473684210527,
"grad_norm": 0.875003457069397,
"learning_rate": 1e-06,
"loss": 0.1011,
"step": 790
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5859375,
"entropy": 0.3143727779388428,
"epoch": 2.081578947368421,
"grad_norm": 0.024129139259457588,
"learning_rate": 1e-06,
"loss": 0.1284,
"step": 791
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4921875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.6015625,
"entropy": 0.2929740250110626,
"epoch": 2.0842105263157893,
"grad_norm": 0.013345804065465927,
"learning_rate": 1e-06,
"loss": 0.0514,
"step": 792
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15700.0,
"completions/mean_length": 1659.0625,
"completions/mean_terminated_length": 903.1622314453125,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"entropy": 0.20717183500528336,
"epoch": 2.086842105263158,
"frac_reward_zero_std": 0.21875,
"grad_norm": 6.014965057373047,
"learning_rate": 1e-06,
"loss": 0.1198,
"num_tokens": 247780298.0,
"reward": 0.7235679626464844,
"reward_std": 0.24241438508033752,
"rewards/progression_diversity/mean": -0.030903812497854233,
"rewards/progression_diversity/std": 0.11096394807100296,
"rewards/symbolic_reward_accuracy/mean": 0.7890625,
"rewards/symbolic_reward_accuracy/std": 0.4083731174468994,
"rewards/symbolic_reward_partial_score/mean": 0.85693359375,
"rewards/symbolic_reward_partial_score/std": 0.3159598112106323,
"rewards/tag_count_reward/mean": -0.06640625,
"rewards/tag_count_reward/std": 0.2492343932390213,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0264015197753906,
"sampling/importance_sampling_ratio/min": 1.2161178811991868e-37,
"sampling/sampling_logp_difference/max": 84.99998474121094,
"sampling/sampling_logp_difference/mean": 0.20792850852012634,
"step": 793
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.20646440982818604,
"epoch": 2.0894736842105264,
"grad_norm": 0.08151978254318237,
"learning_rate": 1e-06,
"loss": 0.1191,
"step": 794
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.22666236013174057,
"epoch": 2.0921052631578947,
"grad_norm": 0.016352776437997818,
"learning_rate": 1e-06,
"loss": 0.1051,
"step": 795
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.23344239592552185,
"epoch": 2.094736842105263,
"grad_norm": 0.0139634869992733,
"learning_rate": 1e-06,
"loss": 0.1076,
"step": 796
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16007.0,
"completions/mean_length": 1441.34375,
"completions/mean_terminated_length": 959.3225708007812,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"entropy": 0.23309873044490814,
"epoch": 2.0973684210526318,
"frac_reward_zero_std": 0.3125,
"grad_norm": 4.238341808319092,
"learning_rate": 1e-06,
"loss": 0.028,
"num_tokens": 248903546.0,
"reward": 0.7395845651626587,
"reward_std": 0.195578932762146,
"rewards/progression_diversity/mean": -0.030801229178905487,
"rewards/progression_diversity/std": 0.1092456728219986,
"rewards/symbolic_reward_accuracy/mean": 0.802734375,
"rewards/symbolic_reward_accuracy/std": 0.3983237147331238,
"rewards/symbolic_reward_partial_score/mean": 0.8758137822151184,
"rewards/symbolic_reward_partial_score/std": 0.29930591583251953,
"rewards/tag_count_reward/mean": -0.044921875,
"rewards/tag_count_reward/std": 0.20733514428138733,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0348535776138306,
"sampling/importance_sampling_ratio/min": 7.160635152699815e-43,
"sampling/sampling_logp_difference/max": 97.04222106933594,
"sampling/sampling_logp_difference/mean": 0.22585123777389526,
"step": 797
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.22810588777065277,
"epoch": 2.1,
"grad_norm": 0.013128053396940231,
"learning_rate": 1e-06,
"loss": 0.0723,
"step": 798
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.24390096962451935,
"epoch": 2.1026315789473684,
"grad_norm": 0.007489512674510479,
"learning_rate": 1e-06,
"loss": 0.1571,
"step": 799
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.22683186829090118,
"epoch": 2.1052631578947367,
"grad_norm": 0.012858620844781399,
"learning_rate": 1e-06,
"loss": 0.128,
"step": 800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13867.0,
"completions/mean_length": 1304.953125,
"completions/mean_terminated_length": 849.85107421875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"entropy": 0.2136317864060402,
"epoch": 2.1078947368421055,
"frac_reward_zero_std": 0.1875,
"grad_norm": 9.192523002624512,
"learning_rate": 1e-06,
"loss": 0.1348,
"num_tokens": 249969474.0,
"reward": 0.7585318684577942,
"reward_std": 0.23733538389205933,
"rewards/progression_diversity/mean": -0.030604764819145203,
"rewards/progression_diversity/std": 0.1174231469631195,
"rewards/symbolic_reward_accuracy/mean": 0.82421875,
"rewards/symbolic_reward_accuracy/std": 0.3810062110424042,
"rewards/symbolic_reward_partial_score/mean": 0.8914388418197632,
"rewards/symbolic_reward_partial_score/std": 0.2718009948730469,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0298779010772705,
"sampling/importance_sampling_ratio/min": 2.0962023727834939e-41,
"sampling/sampling_logp_difference/max": 93.6658706665039,
"sampling/sampling_logp_difference/mean": 0.1556258201599121,
"step": 801
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.20574460923671722,
"epoch": 2.110526315789474,
"grad_norm": 0.00943561177700758,
"learning_rate": 1e-06,
"loss": 0.1348,
"step": 802
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.22100205719470978,
"epoch": 2.113157894736842,
"grad_norm": 0.018543722108006477,
"learning_rate": 1e-06,
"loss": 0.0386,
"step": 803
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.23784782737493515,
"epoch": 2.1157894736842104,
"grad_norm": 0.1003841683268547,
"learning_rate": 1e-06,
"loss": 0.0778,
"step": 804
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13121.0,
"completions/mean_length": 1769.97265625,
"completions/mean_terminated_length": 860.3859252929688,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"entropy": 0.22240455448627472,
"epoch": 2.1184210526315788,
"frac_reward_zero_std": 0.125,
"grad_norm": 10.562141418457031,
"learning_rate": 1e-06,
"loss": 0.1285,
"num_tokens": 251296532.0,
"reward": 0.736179769039154,
"reward_std": 0.24343962967395782,
"rewards/progression_diversity/mean": -0.03436863049864769,
"rewards/progression_diversity/std": 0.12254931777715683,
"rewards/symbolic_reward_accuracy/mean": 0.8046875,
"rewards/symbolic_reward_accuracy/std": 0.3968288004398346,
"rewards/symbolic_reward_partial_score/mean": 0.8671875,
"rewards/symbolic_reward_partial_score/std": 0.31175029277801514,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0263206958770752,
"sampling/importance_sampling_ratio/min": 1.7586295727276454e-42,
"sampling/sampling_logp_difference/max": 96.14400482177734,
"sampling/sampling_logp_difference/mean": 0.2061021327972412,
"step": 805
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.215851292014122,
"epoch": 2.1210526315789475,
"grad_norm": 0.012161717750132084,
"learning_rate": 1e-06,
"loss": 0.1275,
"step": 806
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.20706567913293839,
"epoch": 2.123684210526316,
"grad_norm": 0.007896292954683304,
"learning_rate": 1e-06,
"loss": 0.1679,
"step": 807
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.390625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5234375,
"entropy": 0.21126502752304077,
"epoch": 2.126315789473684,
"grad_norm": 0.007001427933573723,
"learning_rate": 1e-06,
"loss": 0.1365,
"step": 808
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11642.0,
"completions/mean_length": 1819.16796875,
"completions/mean_terminated_length": 848.17919921875,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"entropy": 0.20564715564250946,
"epoch": 2.1289473684210525,
"frac_reward_zero_std": 0.25,
"grad_norm": 6.921549320220947,
"learning_rate": 1e-06,
"loss": 0.1125,
"num_tokens": 252653450.0,
"reward": 0.7197491526603699,
"reward_std": 0.20420284569263458,
"rewards/progression_diversity/mean": -0.04169023782014847,
"rewards/progression_diversity/std": 0.13636480271816254,
"rewards/symbolic_reward_accuracy/mean": 0.779296875,
"rewards/symbolic_reward_accuracy/std": 0.4151262938976288,
"rewards/symbolic_reward_partial_score/mean": 0.8673502206802368,
"rewards/symbolic_reward_partial_score/std": 0.3025065064430237,
"rewards/tag_count_reward/mean": -0.076171875,
"rewards/tag_count_reward/std": 0.26553234457969666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0277680158615112,
"sampling/importance_sampling_ratio/min": 5.605193857299268e-45,
"sampling/sampling_logp_difference/max": 102.0,
"sampling/sampling_logp_difference/mean": 0.18684858083724976,
"step": 809
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.20571620762348175,
"epoch": 2.1315789473684212,
"grad_norm": 0.033720117062330246,
"learning_rate": 1e-06,
"loss": 0.1593,
"step": 810
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.22204747796058655,
"epoch": 2.1342105263157896,
"grad_norm": 0.20377802848815918,
"learning_rate": 1e-06,
"loss": 0.1086,
"step": 811
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.20914125442504883,
"epoch": 2.136842105263158,
"grad_norm": 0.025145720690488815,
"learning_rate": 1e-06,
"loss": 0.1232,
"step": 812
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15292.0,
"completions/mean_length": 1469.455078125,
"completions/mean_terminated_length": 988.3406982421875,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"entropy": 0.21566122770309448,
"epoch": 2.139473684210526,
"frac_reward_zero_std": 0.21875,
"grad_norm": 1.5204887390136719,
"learning_rate": 1e-06,
"loss": 0.0784,
"num_tokens": 253832819.0,
"reward": 0.7044533491134644,
"reward_std": 0.23354381322860718,
"rewards/progression_diversity/mean": -0.033183254301548004,
"rewards/progression_diversity/std": 0.1218782439827919,
"rewards/symbolic_reward_accuracy/mean": 0.75,
"rewards/symbolic_reward_accuracy/std": 0.43343618512153625,
"rewards/symbolic_reward_partial_score/mean": 0.8701171875,
"rewards/symbolic_reward_partial_score/std": 0.28412559628486633,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0307013988494873,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 105.5,
"sampling/sampling_logp_difference/mean": 0.2200557291507721,
"step": 813
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2126343920826912,
"epoch": 2.1421052631578945,
"grad_norm": 0.019540801644325256,
"learning_rate": 1e-06,
"loss": 0.1643,
"step": 814
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.2253113090991974,
"epoch": 2.1447368421052633,
"grad_norm": 0.034518882632255554,
"learning_rate": 1e-06,
"loss": 0.0848,
"step": 815
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.21958737820386887,
"epoch": 2.1473684210526316,
"grad_norm": 0.022237155586481094,
"learning_rate": 1e-06,
"loss": 0.0616,
"step": 816
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16089.0,
"completions/mean_length": 1241.01171875,
"completions/mean_terminated_length": 1091.672607421875,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"entropy": 0.2650395929813385,
"epoch": 2.15,
"frac_reward_zero_std": 0.25,
"grad_norm": 8.908076286315918,
"learning_rate": 1e-06,
"loss": 0.0686,
"num_tokens": 254887545.0,
"reward": 0.7385146617889404,
"reward_std": 0.21202999353408813,
"rewards/progression_diversity/mean": -0.04013665392994881,
"rewards/progression_diversity/std": 0.136456698179245,
"rewards/symbolic_reward_accuracy/mean": 0.806640625,
"rewards/symbolic_reward_accuracy/std": 0.39531853795051575,
"rewards/symbolic_reward_partial_score/mean": 0.8680013418197632,
"rewards/symbolic_reward_partial_score/std": 0.3045817017555237,
"rewards/tag_count_reward/mean": -0.0546875,
"rewards/tag_count_reward/std": 0.2275916188955307,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0395100116729736,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 112.35920715332031,
"sampling/sampling_logp_difference/mean": 0.4265938699245453,
"step": 817
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.2493005096912384,
"epoch": 2.1526315789473682,
"grad_norm": 0.020062845200300217,
"learning_rate": 1e-06,
"loss": 0.1432,
"step": 818
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.23568686097860336,
"epoch": 2.155263157894737,
"grad_norm": 0.028896203264594078,
"learning_rate": 1e-06,
"loss": 0.1101,
"step": 819
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.24035780876874924,
"epoch": 2.1578947368421053,
"grad_norm": 0.01008315198123455,
"learning_rate": 1e-06,
"loss": 0.0286,
"step": 820
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12528.0,
"completions/mean_length": 1109.306640625,
"completions/mean_terminated_length": 958.6686401367188,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"entropy": 0.3168548345565796,
"epoch": 2.1605263157894736,
"frac_reward_zero_std": 0.28125,
"grad_norm": 7.0420708656311035,
"learning_rate": 1e-06,
"loss": 0.079,
"num_tokens": 255865494.0,
"reward": 0.7541958093643188,
"reward_std": 0.2114800214767456,
"rewards/progression_diversity/mean": -0.03452695906162262,
"rewards/progression_diversity/std": 0.1269940882921219,
"rewards/symbolic_reward_accuracy/mean": 0.822265625,
"rewards/symbolic_reward_accuracy/std": 0.3826628625392914,
"rewards/symbolic_reward_partial_score/mean": 0.88818359375,
"rewards/symbolic_reward_partial_score/std": 0.2839567959308624,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0436235666275024,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 119.89215850830078,
"sampling/sampling_logp_difference/mean": 0.4233931005001068,
"step": 821
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2538767158985138,
"epoch": 2.163157894736842,
"grad_norm": 0.014526182785630226,
"learning_rate": 1e-06,
"loss": 0.0951,
"step": 822
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2911252975463867,
"epoch": 2.1657894736842107,
"grad_norm": 0.010871890932321548,
"learning_rate": 1e-06,
"loss": 0.0845,
"step": 823
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2438528761267662,
"epoch": 2.168421052631579,
"grad_norm": 0.008227149024605751,
"learning_rate": 1e-06,
"loss": 0.1009,
"step": 824
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13903.0,
"completions/mean_length": 1524.4453125,
"completions/mean_terminated_length": 1198.1876220703125,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"entropy": 0.32260115444660187,
"epoch": 2.1710526315789473,
"frac_reward_zero_std": 0.28125,
"grad_norm": 18.730411529541016,
"learning_rate": 1e-06,
"loss": 0.0989,
"num_tokens": 257055834.0,
"reward": 0.7190285921096802,
"reward_std": 0.2212091088294983,
"rewards/progression_diversity/mean": -0.04050877317786217,
"rewards/progression_diversity/std": 0.1305409073829651,
"rewards/symbolic_reward_accuracy/mean": 0.78515625,
"rewards/symbolic_reward_accuracy/std": 0.4111155867576599,
"rewards/symbolic_reward_partial_score/mean": 0.8544921875,
"rewards/symbolic_reward_partial_score/std": 0.31850993633270264,
"rewards/tag_count_reward/mean": -0.080078125,
"rewards/tag_count_reward/std": 0.271679550409317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0542526245117188,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 130.9848175048828,
"sampling/sampling_logp_difference/mean": 0.42119100689888,
"step": 825
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.28623804450035095,
"epoch": 2.1736842105263157,
"grad_norm": 0.009265235625207424,
"learning_rate": 1e-06,
"loss": 0.0857,
"step": 826
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.3048030138015747,
"epoch": 2.1763157894736844,
"grad_norm": 0.5141742825508118,
"learning_rate": 1e-06,
"loss": 0.0416,
"step": 827
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.33180132508277893,
"epoch": 2.1789473684210527,
"grad_norm": 0.01808958500623703,
"learning_rate": 1e-06,
"loss": 0.1253,
"step": 828
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14963.0,
"completions/mean_length": 1386.966796875,
"completions/mean_terminated_length": 996.2625122070312,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"entropy": 0.3035607635974884,
"epoch": 2.181578947368421,
"frac_reward_zero_std": 0.28125,
"grad_norm": 20.19339370727539,
"learning_rate": 1e-06,
"loss": 0.1055,
"num_tokens": 258146409.0,
"reward": 0.7440898418426514,
"reward_std": 0.20957735180854797,
"rewards/progression_diversity/mean": -0.029496124014258385,
"rewards/progression_diversity/std": 0.10593532025814056,
"rewards/symbolic_reward_accuracy/mean": 0.814453125,
"rewards/symbolic_reward_accuracy/std": 0.38912075757980347,
"rewards/symbolic_reward_partial_score/mean": 0.8712565302848816,
"rewards/symbolic_reward_partial_score/std": 0.30432677268981934,
"rewards/tag_count_reward/mean": -0.056640625,
"rewards/tag_count_reward/std": 0.23138070106506348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.059896469116211,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 135.2486572265625,
"sampling/sampling_logp_difference/mean": 0.5292673707008362,
"step": 829
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.31164059042930603,
"epoch": 2.1842105263157894,
"grad_norm": 4.756748199462891,
"learning_rate": 1e-06,
"loss": 0.0966,
"step": 830
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.32979193329811096,
"epoch": 2.1868421052631577,
"grad_norm": 0.020023655146360397,
"learning_rate": 1e-06,
"loss": 0.1029,
"step": 831
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.30019381642341614,
"epoch": 2.1894736842105265,
"grad_norm": 0.01554365735501051,
"learning_rate": 1e-06,
"loss": 0.0552,
"step": 832
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14676.0,
"completions/mean_length": 1498.107421875,
"completions/mean_terminated_length": 955.7064819335938,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"entropy": 0.4383620619773865,
"epoch": 2.192105263157895,
"frac_reward_zero_std": 0.21875,
"grad_norm": 26.805294036865234,
"learning_rate": 1e-06,
"loss": 0.1879,
"num_tokens": 259308064.0,
"reward": 0.708129346370697,
"reward_std": 0.2203439176082611,
"rewards/progression_diversity/mean": -0.03179560601711273,
"rewards/progression_diversity/std": 0.11374954134225845,
"rewards/symbolic_reward_accuracy/mean": 0.765625,
"rewards/symbolic_reward_accuracy/std": 0.42402184009552,
"rewards/symbolic_reward_partial_score/mean": 0.8465169072151184,
"rewards/symbolic_reward_partial_score/std": 0.31652384996414185,
"rewards/tag_count_reward/mean": -0.048828125,
"rewards/tag_count_reward/std": 0.2157193273305893,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.056193470954895,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 135.6931915283203,
"sampling/sampling_logp_difference/mean": 0.8543011546134949,
"step": 833
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.3004182279109955,
"epoch": 2.194736842105263,
"grad_norm": 0.017624402418732643,
"learning_rate": 1e-06,
"loss": 0.0347,
"step": 834
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3438268452882767,
"epoch": 2.1973684210526314,
"grad_norm": 0.015222882851958275,
"learning_rate": 1e-06,
"loss": 0.1393,
"step": 835
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.32134924829006195,
"epoch": 2.2,
"grad_norm": 0.017897766083478928,
"learning_rate": 1e-06,
"loss": 0.0976,
"step": 836
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13689.0,
"completions/mean_length": 1387.171875,
"completions/mean_terminated_length": 903.4031982421875,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"entropy": 0.41330593824386597,
"epoch": 2.2026315789473685,
"frac_reward_zero_std": 0.375,
"grad_norm": 17.12291717529297,
"learning_rate": 1e-06,
"loss": 0.1254,
"num_tokens": 260406584.0,
"reward": 0.7749391794204712,
"reward_std": 0.17058953642845154,
"rewards/progression_diversity/mean": -0.020733974874019623,
"rewards/progression_diversity/std": 0.08155296742916107,
"rewards/symbolic_reward_accuracy/mean": 0.853515625,
"rewards/symbolic_reward_accuracy/std": 0.35393697023391724,
"rewards/symbolic_reward_partial_score/mean": 0.8982747793197632,
"rewards/symbolic_reward_partial_score/std": 0.27264928817749023,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0631152391433716,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 136.99832153320312,
"sampling/sampling_logp_difference/mean": 0.695594310760498,
"step": 837
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.40689145028591156,
"epoch": 2.205263157894737,
"grad_norm": 0.004986094310879707,
"learning_rate": 1e-06,
"loss": 0.1718,
"step": 838
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3304235339164734,
"epoch": 2.207894736842105,
"grad_norm": 0.013745912350714207,
"learning_rate": 1e-06,
"loss": 0.0868,
"step": 839
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.32582440972328186,
"epoch": 2.2105263157894735,
"grad_norm": 0.00617125304415822,
"learning_rate": 1e-06,
"loss": 0.0352,
"step": 840
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13147.0,
"completions/mean_length": 1191.048828125,
"completions/mean_terminated_length": 605.5192260742188,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"entropy": 0.3709278106689453,
"epoch": 2.213157894736842,
"frac_reward_zero_std": 0.25,
"grad_norm": 21.482324600219727,
"learning_rate": 1e-06,
"loss": 0.1633,
"num_tokens": 261411185.0,
"reward": 0.7811179161071777,
"reward_std": 0.21279799938201904,
"rewards/progression_diversity/mean": -0.018096236512064934,
"rewards/progression_diversity/std": 0.08493653684854507,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.9129231572151184,
"rewards/symbolic_reward_partial_score/std": 0.247787743806839,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0535836219787598,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 139.99432373046875,
"sampling/sampling_logp_difference/mean": 1.1103196144104004,
"step": 841
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3213431090116501,
"epoch": 2.2157894736842105,
"grad_norm": 0.012497864663600922,
"learning_rate": 1e-06,
"loss": 0.0665,
"step": 842
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.3591870814561844,
"epoch": 2.218421052631579,
"grad_norm": 0.010890123434364796,
"learning_rate": 1e-06,
"loss": 0.1304,
"step": 843
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2970151901245117,
"epoch": 2.221052631578947,
"grad_norm": 0.11356399953365326,
"learning_rate": 1e-06,
"loss": 0.0636,
"step": 844
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14047.0,
"completions/mean_length": 1340.443359375,
"completions/mean_terminated_length": 568.1868896484375,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"entropy": 0.33584554493427277,
"epoch": 2.223684210526316,
"frac_reward_zero_std": 0.375,
"grad_norm": 19.238874435424805,
"learning_rate": 1e-06,
"loss": 0.1027,
"num_tokens": 262479924.0,
"reward": 0.7776129245758057,
"reward_std": 0.1951584815979004,
"rewards/progression_diversity/mean": -0.01703125424683094,
"rewards/progression_diversity/std": 0.0720345601439476,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.90380859375,
"rewards/symbolic_reward_partial_score/std": 0.2672818601131439,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0570869445800781,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 143.1820831298828,
"sampling/sampling_logp_difference/mean": 0.9214514493942261,
"step": 845
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.3423171490430832,
"epoch": 2.2263157894736842,
"grad_norm": 0.010278112255036831,
"learning_rate": 1e-06,
"loss": 0.1566,
"step": 846
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.3419143259525299,
"epoch": 2.2289473684210526,
"grad_norm": 0.011319981887936592,
"learning_rate": 1e-06,
"loss": 0.0796,
"step": 847
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.3335099071264267,
"epoch": 2.231578947368421,
"grad_norm": 0.015382968820631504,
"learning_rate": 1e-06,
"loss": 0.0699,
"step": 848
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14075.0,
"completions/mean_length": 1389.193359375,
"completions/mean_terminated_length": 811.3001708984375,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"entropy": 0.35164597630500793,
"epoch": 2.2342105263157896,
"frac_reward_zero_std": 0.3125,
"grad_norm": 7.93258810043335,
"learning_rate": 1e-06,
"loss": 0.0673,
"num_tokens": 263594679.0,
"reward": 0.7638546228408813,
"reward_std": 0.19747570157051086,
"rewards/progression_diversity/mean": -0.025676485151052475,
"rewards/progression_diversity/std": 0.09972041100263596,
"rewards/symbolic_reward_accuracy/mean": 0.8359375,
"rewards/symbolic_reward_accuracy/std": 0.37069445848464966,
"rewards/symbolic_reward_partial_score/mean": 0.8953450322151184,
"rewards/symbolic_reward_partial_score/std": 0.2742253541946411,
"rewards/tag_count_reward/mean": -0.060546875,
"rewards/tag_count_reward/std": 0.2387305200099945,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.053678035736084,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 163.0,
"sampling/sampling_logp_difference/mean": 1.083064317703247,
"step": 849
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.3196953535079956,
"epoch": 2.236842105263158,
"grad_norm": 0.17636702954769135,
"learning_rate": 1e-06,
"loss": 0.0405,
"step": 850
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.37696947157382965,
"epoch": 2.2394736842105263,
"grad_norm": 0.02072129212319851,
"learning_rate": 1e-06,
"loss": 0.1117,
"step": 851
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3250402361154556,
"epoch": 2.2421052631578946,
"grad_norm": 0.014444287866353989,
"learning_rate": 1e-06,
"loss": 0.1048,
"step": 852
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13668.0,
"completions/mean_length": 1304.669921875,
"completions/mean_terminated_length": 627.6387329101562,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"entropy": 0.3698349595069885,
"epoch": 2.2447368421052634,
"frac_reward_zero_std": 0.375,
"grad_norm": 217.0945587158203,
"learning_rate": 1e-06,
"loss": 0.1591,
"num_tokens": 264646126.0,
"reward": 0.7918961048126221,
"reward_std": 0.1911715716123581,
"rewards/progression_diversity/mean": -0.019374651834368706,
"rewards/progression_diversity/std": 0.08247331529855728,
"rewards/symbolic_reward_accuracy/mean": 0.873046875,
"rewards/symbolic_reward_accuracy/std": 0.33324605226516724,
"rewards/symbolic_reward_partial_score/mean": 0.9098306894302368,
"rewards/symbolic_reward_partial_score/std": 0.2634342610836029,
"rewards/tag_count_reward/mean": -0.046875,
"rewards/tag_count_reward/std": 0.21157780289649963,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0525051355361938,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 169.99769592285156,
"sampling/sampling_logp_difference/mean": 1.494558334350586,
"step": 853
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.30978524684906006,
"epoch": 2.2473684210526317,
"grad_norm": 0.008726145140826702,
"learning_rate": 1e-06,
"loss": 0.0635,
"step": 854
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.32608427107334137,
"epoch": 2.25,
"grad_norm": 0.004246350843459368,
"learning_rate": 1e-06,
"loss": 0.103,
"step": 855
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.308328241109848,
"epoch": 2.2526315789473683,
"grad_norm": 0.005722150672227144,
"learning_rate": 1e-06,
"loss": 0.0967,
"step": 856
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13585.0,
"completions/mean_length": 1705.638671875,
"completions/mean_terminated_length": 694.3945922851562,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"entropy": 0.29534509778022766,
"epoch": 2.2552631578947366,
"frac_reward_zero_std": 0.15625,
"grad_norm": 196.59950256347656,
"learning_rate": 1e-06,
"loss": 0.1282,
"num_tokens": 265932725.0,
"reward": 0.7359683513641357,
"reward_std": 0.2585596442222595,
"rewards/progression_diversity/mean": -0.031095456331968307,
"rewards/progression_diversity/std": 0.1090308129787445,
"rewards/symbolic_reward_accuracy/mean": 0.80078125,
"rewards/symbolic_reward_accuracy/std": 0.39980348944664,
"rewards/symbolic_reward_partial_score/mean": 0.87548828125,
"rewards/symbolic_reward_partial_score/std": 0.2973478138446808,
"rewards/tag_count_reward/mean": -0.068359375,
"rewards/tag_count_reward/std": 0.25260838866233826,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0500712394714355,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 166.0,
"sampling/sampling_logp_difference/mean": 1.9202924966812134,
"step": 857
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.40625,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.546875,
"entropy": 0.377376526594162,
"epoch": 2.2578947368421054,
"grad_norm": 0.00950684119015932,
"learning_rate": 1e-06,
"loss": 0.1836,
"step": 858
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.4171202927827835,
"epoch": 2.2605263157894737,
"grad_norm": 41.98322296142578,
"learning_rate": 1e-06,
"loss": 0.2213,
"step": 859
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.3512898087501526,
"epoch": 2.263157894736842,
"grad_norm": 0.01232555229216814,
"learning_rate": 1e-06,
"loss": 0.1466,
"step": 860
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11700.0,
"completions/mean_length": 1474.6875,
"completions/mean_terminated_length": 612.165283203125,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"entropy": 0.3273746818304062,
"epoch": 2.2657894736842104,
"frac_reward_zero_std": 0.1875,
"grad_norm": 47.61751937866211,
"learning_rate": 1e-06,
"loss": 0.1721,
"num_tokens": 267097749.0,
"reward": 0.7948732972145081,
"reward_std": 0.23872321844100952,
"rewards/progression_diversity/mean": -0.02927570417523384,
"rewards/progression_diversity/std": 0.11151636391878128,
"rewards/symbolic_reward_accuracy/mean": 0.875,
"rewards/symbolic_reward_accuracy/std": 0.3310423493385315,
"rewards/symbolic_reward_partial_score/mean": 0.9148763418197632,
"rewards/symbolic_reward_partial_score/std": 0.253930926322937,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0462769269943237,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 157.0,
"sampling/sampling_logp_difference/mean": 1.056786060333252,
"step": 861
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.28751927614212036,
"epoch": 2.268421052631579,
"grad_norm": 0.012664435431361198,
"learning_rate": 1e-06,
"loss": 0.0962,
"step": 862
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3128338009119034,
"epoch": 2.2710526315789474,
"grad_norm": 0.010503551922738552,
"learning_rate": 1e-06,
"loss": 0.1709,
"step": 863
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.299874484539032,
"epoch": 2.2736842105263158,
"grad_norm": 0.009202693589031696,
"learning_rate": 1e-06,
"loss": 0.1373,
"step": 864
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12330.0,
"completions/mean_length": 1326.27734375,
"completions/mean_terminated_length": 618.0408935546875,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"entropy": 0.29694661498069763,
"epoch": 2.276315789473684,
"frac_reward_zero_std": 0.25,
"grad_norm": 28.95979881286621,
"learning_rate": 1e-06,
"loss": 0.1895,
"num_tokens": 268138083.0,
"reward": 0.7938084602355957,
"reward_std": 0.1921263337135315,
"rewards/progression_diversity/mean": -0.02833309769630432,
"rewards/progression_diversity/std": 0.11551596224308014,
"rewards/symbolic_reward_accuracy/mean": 0.869140625,
"rewards/symbolic_reward_accuracy/std": 0.33757632970809937,
"rewards/symbolic_reward_partial_score/mean": 0.92041015625,
"rewards/symbolic_reward_partial_score/std": 0.24347224831581116,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0430631637573242,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 157.90789794921875,
"sampling/sampling_logp_difference/mean": 0.31643933057785034,
"step": 865
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.2519264966249466,
"epoch": 2.2789473684210524,
"grad_norm": 0.0065588075667619705,
"learning_rate": 1e-06,
"loss": 0.0924,
"step": 866
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.260475218296051,
"epoch": 2.281578947368421,
"grad_norm": 0.291818231344223,
"learning_rate": 1e-06,
"loss": 0.0611,
"step": 867
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.2639893591403961,
"epoch": 2.2842105263157895,
"grad_norm": 0.010040219873189926,
"learning_rate": 1e-06,
"loss": 0.0681,
"step": 868
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10708.0,
"completions/mean_length": 1395.240234375,
"completions/mean_terminated_length": 560.814453125,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"entropy": 0.2564142793416977,
"epoch": 2.286842105263158,
"frac_reward_zero_std": 0.34375,
"grad_norm": 28.421417236328125,
"learning_rate": 1e-06,
"loss": 0.2116,
"num_tokens": 269248734.0,
"reward": 0.7830079793930054,
"reward_std": 0.18753179907798767,
"rewards/progression_diversity/mean": -0.03417276591062546,
"rewards/progression_diversity/std": 0.13364149630069733,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.9034830331802368,
"rewards/symbolic_reward_partial_score/std": 0.2679261863231659,
"rewards/tag_count_reward/mean": -0.044921875,
"rewards/tag_count_reward/std": 0.20733514428138733,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.034959316253662,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 158.98016357421875,
"sampling/sampling_logp_difference/mean": 0.3048059940338135,
"step": 869
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2360822707414627,
"epoch": 2.2894736842105265,
"grad_norm": 0.009712324477732182,
"learning_rate": 1e-06,
"loss": 0.0756,
"step": 870
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2438521683216095,
"epoch": 2.292105263157895,
"grad_norm": 0.008457427844405174,
"learning_rate": 1e-06,
"loss": 0.0852,
"step": 871
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.23070208728313446,
"epoch": 2.294736842105263,
"grad_norm": 0.006932724732905626,
"learning_rate": 1e-06,
"loss": 0.1447,
"step": 872
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10716.0,
"completions/mean_length": 1633.05859375,
"completions/mean_terminated_length": 583.8284301757812,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.24190223217010498,
"epoch": 2.2973684210526315,
"frac_reward_zero_std": 0.3125,
"grad_norm": 5.179454326629639,
"learning_rate": 1e-06,
"loss": 0.0743,
"num_tokens": 270490172.0,
"reward": 0.7714011669158936,
"reward_std": 0.20021489262580872,
"rewards/progression_diversity/mean": -0.03762088716030121,
"rewards/progression_diversity/std": 0.13623382151126862,
"rewards/symbolic_reward_accuracy/mean": 0.84375,
"rewards/symbolic_reward_accuracy/std": 0.36344730854034424,
"rewards/symbolic_reward_partial_score/mean": 0.9046223759651184,
"rewards/symbolic_reward_partial_score/std": 0.26515042781829834,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0347652435302734,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 162.99392700195312,
"sampling/sampling_logp_difference/mean": 0.2914007902145386,
"step": 873
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.24344737827777863,
"epoch": 2.3,
"grad_norm": 3.351227045059204,
"learning_rate": 1e-06,
"loss": 0.1364,
"step": 874
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2388121336698532,
"epoch": 2.3026315789473686,
"grad_norm": 0.011248363181948662,
"learning_rate": 1e-06,
"loss": 0.154,
"step": 875
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.2308916077017784,
"epoch": 2.305263157894737,
"grad_norm": 0.24132992327213287,
"learning_rate": 1e-06,
"loss": 0.1312,
"step": 876
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10811.0,
"completions/mean_length": 1310.69921875,
"completions/mean_terminated_length": 569.3892822265625,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"entropy": 0.24580485373735428,
"epoch": 2.307894736842105,
"frac_reward_zero_std": 0.3125,
"grad_norm": 3.5337963104248047,
"learning_rate": 1e-06,
"loss": 0.0719,
"num_tokens": 271567746.0,
"reward": 0.7978794574737549,
"reward_std": 0.2167576551437378,
"rewards/progression_diversity/mean": -0.026513516902923584,
"rewards/progression_diversity/std": 0.11184799671173096,
"rewards/symbolic_reward_accuracy/mean": 0.880859375,
"rewards/symbolic_reward_accuracy/std": 0.32427072525024414,
"rewards/symbolic_reward_partial_score/mean": 0.9156901240348816,
"rewards/symbolic_reward_partial_score/std": 0.25561559200286865,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0415388345718384,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 161.8677520751953,
"sampling/sampling_logp_difference/mean": 0.3057638108730316,
"step": 877
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.2451568767428398,
"epoch": 2.3105263157894735,
"grad_norm": 0.012724120169878006,
"learning_rate": 1e-06,
"loss": 0.0798,
"step": 878
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.24493427574634552,
"epoch": 2.3131578947368423,
"grad_norm": 0.018414990976452827,
"learning_rate": 1e-06,
"loss": 0.1669,
"step": 879
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.24908612668514252,
"epoch": 2.3157894736842106,
"grad_norm": 0.00875798612833023,
"learning_rate": 1e-06,
"loss": 0.138,
"step": 880
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11232.0,
"completions/mean_length": 1375.77734375,
"completions/mean_terminated_length": 572.8682861328125,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"entropy": 0.2587193250656128,
"epoch": 2.318421052631579,
"frac_reward_zero_std": 0.28125,
"grad_norm": 12.977412223815918,
"learning_rate": 1e-06,
"loss": 0.1116,
"num_tokens": 272652592.0,
"reward": 0.7711009979248047,
"reward_std": 0.20819194614887238,
"rewards/progression_diversity/mean": -0.02857367694377899,
"rewards/progression_diversity/std": 0.11577697843313217,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.9085286259651184,
"rewards/symbolic_reward_partial_score/std": 0.2529752552509308,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0420448780059814,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 166.0,
"sampling/sampling_logp_difference/mean": 0.29961156845092773,
"step": 881
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.26260629296302795,
"epoch": 2.3210526315789473,
"grad_norm": 0.06035393849015236,
"learning_rate": 1e-06,
"loss": 0.1662,
"step": 882
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.2572166621685028,
"epoch": 2.3236842105263156,
"grad_norm": 0.01509555708616972,
"learning_rate": 1e-06,
"loss": 0.0984,
"step": 883
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2562931329011917,
"epoch": 2.3263157894736843,
"grad_norm": 0.009722933173179626,
"learning_rate": 1e-06,
"loss": 0.0405,
"step": 884
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11360.0,
"completions/mean_length": 1769.9296875,
"completions/mean_terminated_length": 531.4491577148438,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"entropy": 0.2543962597846985,
"epoch": 2.3289473684210527,
"frac_reward_zero_std": 0.3125,
"grad_norm": 8.712998390197754,
"learning_rate": 1e-06,
"loss": 0.1231,
"num_tokens": 273958092.0,
"reward": 0.7438181042671204,
"reward_std": 0.2017257809638977,
"rewards/progression_diversity/mean": -0.04201960563659668,
"rewards/progression_diversity/std": 0.1431008279323578,
"rewards/symbolic_reward_accuracy/mean": 0.810546875,
"rewards/symbolic_reward_accuracy/std": 0.3922513723373413,
"rewards/symbolic_reward_partial_score/mean": 0.8818359375,
"rewards/symbolic_reward_partial_score/std": 0.29238346219062805,
"rewards/tag_count_reward/mean": -0.06640625,
"rewards/tag_count_reward/std": 0.2492343932390213,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.041464924812317,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 171.0,
"sampling/sampling_logp_difference/mean": 0.3230324387550354,
"step": 885
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.24783600121736526,
"epoch": 2.331578947368421,
"grad_norm": 0.008411634713411331,
"learning_rate": 1e-06,
"loss": 0.0807,
"step": 886
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.24633554369211197,
"epoch": 2.3342105263157893,
"grad_norm": 0.013761989772319794,
"learning_rate": 1e-06,
"loss": 0.1864,
"step": 887
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2572323977947235,
"epoch": 2.336842105263158,
"grad_norm": 0.011136609129607677,
"learning_rate": 1e-06,
"loss": 0.1732,
"step": 888
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15127.0,
"completions/mean_length": 1794.662109375,
"completions/mean_terminated_length": 756.9267578125,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.274614155292511,
"epoch": 2.3394736842105264,
"frac_reward_zero_std": 0.28125,
"grad_norm": 30.201536178588867,
"learning_rate": 1e-06,
"loss": 0.1424,
"num_tokens": 275266591.0,
"reward": 0.7809537649154663,
"reward_std": 0.18957683444023132,
"rewards/progression_diversity/mean": -0.039390042424201965,
"rewards/progression_diversity/std": 0.1323193460702896,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.9039713740348816,
"rewards/symbolic_reward_partial_score/std": 0.26959308981895447,
"rewards/tag_count_reward/mean": -0.06640625,
"rewards/tag_count_reward/std": 0.2492343932390213,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.047221302986145,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 186.931884765625,
"sampling/sampling_logp_difference/mean": 0.643196702003479,
"step": 889
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.2707149535417557,
"epoch": 2.3421052631578947,
"grad_norm": 0.010098773054778576,
"learning_rate": 1e-06,
"loss": 0.1054,
"step": 890
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.28222164511680603,
"epoch": 2.344736842105263,
"grad_norm": 0.3068772554397583,
"learning_rate": 1e-06,
"loss": 0.1973,
"step": 891
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.266815185546875,
"epoch": 2.3473684210526318,
"grad_norm": 1.143034815788269,
"learning_rate": 1e-06,
"loss": 0.126,
"step": 892
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12367.0,
"completions/mean_length": 1806.0625,
"completions/mean_terminated_length": 570.64404296875,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"entropy": 0.2788756787776947,
"epoch": 2.35,
"frac_reward_zero_std": 0.25,
"grad_norm": 49.109130859375,
"learning_rate": 1e-06,
"loss": 0.192,
"num_tokens": 276565887.0,
"reward": 0.7949280142784119,
"reward_std": 0.20325177907943726,
"rewards/progression_diversity/mean": -0.04333332180976868,
"rewards/progression_diversity/std": 0.14543607831001282,
"rewards/symbolic_reward_accuracy/mean": 0.880859375,
"rewards/symbolic_reward_accuracy/std": 0.32427072525024414,
"rewards/symbolic_reward_partial_score/mean": 0.91162109375,
"rewards/symbolic_reward_partial_score/std": 0.2654019296169281,
"rewards/tag_count_reward/mean": -0.06640625,
"rewards/tag_count_reward/std": 0.2492343932390213,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0455005168914795,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 178.99993896484375,
"sampling/sampling_logp_difference/mean": 0.7122431993484497,
"step": 893
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.2602469027042389,
"epoch": 2.3526315789473684,
"grad_norm": 0.005726585630327463,
"learning_rate": 1e-06,
"loss": 0.1203,
"step": 894
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.2623884826898575,
"epoch": 2.3552631578947367,
"grad_norm": 0.15292230248451233,
"learning_rate": 1e-06,
"loss": 0.0371,
"step": 895
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.2809968888759613,
"epoch": 2.3578947368421055,
"grad_norm": 0.005319634452462196,
"learning_rate": 1e-06,
"loss": 0.2592,
"step": 896
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14861.0,
"completions/mean_length": 2606.03515625,
"completions/mean_terminated_length": 777.1017456054688,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.28651466965675354,
"epoch": 2.360526315789474,
"frac_reward_zero_std": 0.09375,
"grad_norm": 24.261709213256836,
"learning_rate": 1e-06,
"loss": 0.1616,
"num_tokens": 278317681.0,
"reward": 0.7092536687850952,
"reward_std": 0.2737279534339905,
"rewards/progression_diversity/mean": -0.05608125776052475,
"rewards/progression_diversity/std": 0.1531989425420761,
"rewards/symbolic_reward_accuracy/mean": 0.78125,
"rewards/symbolic_reward_accuracy/std": 0.41380295157432556,
"rewards/symbolic_reward_partial_score/mean": 0.8426106572151184,
"rewards/symbolic_reward_partial_score/std": 0.3354193866252899,
"rewards/tag_count_reward/mean": -0.1171875,
"rewards/tag_count_reward/std": 0.32195815443992615,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.045390248298645,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 180.99990844726562,
"sampling/sampling_logp_difference/mean": 0.8015948534011841,
"step": 897
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.30110684037208557,
"epoch": 2.363157894736842,
"grad_norm": 4.408512115478516,
"learning_rate": 1e-06,
"loss": 0.2523,
"step": 898
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5234375,
"entropy": 0.2703721821308136,
"epoch": 2.3657894736842104,
"grad_norm": 0.4832998216152191,
"learning_rate": 1e-06,
"loss": 0.1279,
"step": 899
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4140625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.53125,
"entropy": 0.28015589714050293,
"epoch": 2.3684210526315788,
"grad_norm": 0.007808802183717489,
"learning_rate": 1e-06,
"loss": 0.1913,
"step": 900
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12939.0,
"completions/mean_length": 1535.99609375,
"completions/mean_terminated_length": 805.766357421875,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"entropy": 0.2482433319091797,
"epoch": 2.3710526315789475,
"frac_reward_zero_std": 0.25,
"grad_norm": 15.909863471984863,
"learning_rate": 1e-06,
"loss": 0.0303,
"num_tokens": 279514095.0,
"reward": 0.775747537612915,
"reward_std": 0.22888442873954773,
"rewards/progression_diversity/mean": -0.032673317939043045,
"rewards/progression_diversity/std": 0.12282172590494156,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.9013671875,
"rewards/symbolic_reward_partial_score/std": 0.26712751388549805,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0461652278900146,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 189.7768096923828,
"sampling/sampling_logp_difference/mean": 0.7810826301574707,
"step": 901
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.275927871465683,
"epoch": 2.373684210526316,
"grad_norm": 0.009297163225710392,
"learning_rate": 1e-06,
"loss": 0.1966,
"step": 902
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.2581518739461899,
"epoch": 2.376315789473684,
"grad_norm": 0.01285830419510603,
"learning_rate": 1e-06,
"loss": 0.1598,
"step": 903
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.27708950638771057,
"epoch": 2.3789473684210525,
"grad_norm": 0.01762818545103073,
"learning_rate": 1e-06,
"loss": 0.2154,
"step": 904
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12266.0,
"completions/mean_length": 1788.240234375,
"completions/mean_terminated_length": 684.3592529296875,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.2828632742166519,
"epoch": 2.3815789473684212,
"frac_reward_zero_std": 0.1875,
"grad_norm": 28.789213180541992,
"learning_rate": 1e-06,
"loss": 0.1664,
"num_tokens": 280833482.0,
"reward": 0.7678719162940979,
"reward_std": 0.19974274933338165,
"rewards/progression_diversity/mean": -0.034103699028491974,
"rewards/progression_diversity/std": 0.11745806038379669,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.9051106572151184,
"rewards/symbolic_reward_partial_score/std": 0.2643766403198242,
"rewards/tag_count_reward/mean": -0.072265625,
"rewards/tag_count_reward/std": 0.2591804563999176,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0446805953979492,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 187.5076446533203,
"sampling/sampling_logp_difference/mean": 0.7177349328994751,
"step": 905
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2618148922920227,
"epoch": 2.3842105263157896,
"grad_norm": 0.5371949672698975,
"learning_rate": 1e-06,
"loss": 0.1672,
"step": 906
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.26834670454263687,
"epoch": 2.386842105263158,
"grad_norm": 0.008967792615294456,
"learning_rate": 1e-06,
"loss": 0.0927,
"step": 907
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4140625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.2814731001853943,
"epoch": 2.389473684210526,
"grad_norm": 1.3082025051116943,
"learning_rate": 1e-06,
"loss": 0.1104,
"step": 908
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11597.0,
"completions/mean_length": 2223.818359375,
"completions/mean_terminated_length": 826.0321655273438,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"entropy": 0.26080475747585297,
"epoch": 2.3921052631578945,
"frac_reward_zero_std": 0.15625,
"grad_norm": 56.598018646240234,
"learning_rate": 1e-06,
"loss": 0.1662,
"num_tokens": 282405421.0,
"reward": 0.7159743309020996,
"reward_std": 0.26898401975631714,
"rewards/progression_diversity/mean": -0.04807615280151367,
"rewards/progression_diversity/std": 0.13962800800800323,
"rewards/symbolic_reward_accuracy/mean": 0.783203125,
"rewards/symbolic_reward_accuracy/std": 0.4124660789966583,
"rewards/symbolic_reward_partial_score/mean": 0.8575846552848816,
"rewards/symbolic_reward_partial_score/std": 0.31706979870796204,
"rewards/tag_count_reward/mean": -0.107421875,
"rewards/tag_count_reward/std": 0.30995169281959534,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0462749004364014,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 186.0,
"sampling/sampling_logp_difference/mean": 0.7766126990318298,
"step": 909
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.27692703902721405,
"epoch": 2.3947368421052633,
"grad_norm": 3.8725080490112305,
"learning_rate": 1e-06,
"loss": 0.1265,
"step": 910
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.2723271772265434,
"epoch": 2.3973684210526316,
"grad_norm": 3.388723611831665,
"learning_rate": 1e-06,
"loss": 0.1184,
"step": 911
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.2788117825984955,
"epoch": 2.4,
"grad_norm": 0.0120708541944623,
"learning_rate": 1e-06,
"loss": 0.2389,
"step": 912
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11453.0,
"completions/mean_length": 1910.248046875,
"completions/mean_terminated_length": 616.8489379882812,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"entropy": 0.2541096359491348,
"epoch": 2.4026315789473682,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.020459089428186417,
"learning_rate": 1e-06,
"loss": 0.0879,
"num_tokens": 283807468.0,
"reward": 0.7370096445083618,
"reward_std": 0.23636674880981445,
"rewards/progression_diversity/mean": -0.03927619010210037,
"rewards/progression_diversity/std": 0.13120803236961365,
"rewards/symbolic_reward_accuracy/mean": 0.802734375,
"rewards/symbolic_reward_accuracy/std": 0.3983237147331238,
"rewards/symbolic_reward_partial_score/mean": 0.8798828125,
"rewards/symbolic_reward_partial_score/std": 0.28891611099243164,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0478408336639404,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 186.0,
"sampling/sampling_logp_difference/mean": 0.6119551658630371,
"step": 913
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.28826163709163666,
"epoch": 2.405263157894737,
"grad_norm": 10.523284912109375,
"learning_rate": 1e-06,
"loss": 0.2367,
"step": 914
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2593996897339821,
"epoch": 2.4078947368421053,
"grad_norm": 0.012641196139156818,
"learning_rate": 1e-06,
"loss": 0.1789,
"step": 915
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.2622472196817398,
"epoch": 2.4105263157894736,
"grad_norm": 0.012980838306248188,
"learning_rate": 1e-06,
"loss": 0.0986,
"step": 916
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10840.0,
"completions/mean_length": 1772.17578125,
"completions/mean_terminated_length": 667.0798950195312,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"entropy": 0.2561643496155739,
"epoch": 2.413157894736842,
"frac_reward_zero_std": 0.25,
"grad_norm": 34.25957489013672,
"learning_rate": 1e-06,
"loss": 0.0891,
"num_tokens": 285117126.0,
"reward": 0.7429047226905823,
"reward_std": 0.22891083359718323,
"rewards/progression_diversity/mean": -0.040586501359939575,
"rewards/progression_diversity/std": 0.14015312492847443,
"rewards/symbolic_reward_accuracy/mean": 0.810546875,
"rewards/symbolic_reward_accuracy/std": 0.3922513723373413,
"rewards/symbolic_reward_partial_score/mean": 0.8826497793197632,
"rewards/symbolic_reward_partial_score/std": 0.28473344445228577,
"rewards/tag_count_reward/mean": -0.078125,
"rewards/tag_count_reward/std": 0.26863065361976624,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0470426082611084,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 185.0,
"sampling/sampling_logp_difference/mean": 0.5546722412109375,
"step": 917
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.2671400457620621,
"epoch": 2.4157894736842107,
"grad_norm": 0.01011139526963234,
"learning_rate": 1e-06,
"loss": 0.1876,
"step": 918
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.25694411993026733,
"epoch": 2.418421052631579,
"grad_norm": 0.011605838313698769,
"learning_rate": 1e-06,
"loss": 0.1389,
"step": 919
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.2727016508579254,
"epoch": 2.4210526315789473,
"grad_norm": 1.0605076551437378,
"learning_rate": 1e-06,
"loss": 0.167,
"step": 920
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11618.0,
"completions/mean_length": 1737.193359375,
"completions/mean_terminated_length": 728.1231689453125,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"entropy": 0.26561206579208374,
"epoch": 2.4236842105263157,
"frac_reward_zero_std": 0.25,
"grad_norm": 114.35115814208984,
"learning_rate": 1e-06,
"loss": 0.1868,
"num_tokens": 286425737.0,
"reward": 0.7702474594116211,
"reward_std": 0.19327302277088165,
"rewards/progression_diversity/mean": -0.03580557554960251,
"rewards/progression_diversity/std": 0.12353134900331497,
"rewards/symbolic_reward_accuracy/mean": 0.84375,
"rewards/symbolic_reward_accuracy/std": 0.36344730854034424,
"rewards/symbolic_reward_partial_score/mean": 0.9000650644302368,
"rewards/symbolic_reward_partial_score/std": 0.2719910442829132,
"rewards/tag_count_reward/mean": -0.056640625,
"rewards/tag_count_reward/std": 0.23138070106506348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0514814853668213,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 187.99998474121094,
"sampling/sampling_logp_difference/mean": 0.6879295110702515,
"step": 921
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.2657042294740677,
"epoch": 2.4263157894736844,
"grad_norm": 4.603257179260254,
"learning_rate": 1e-06,
"loss": 0.0912,
"step": 922
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2845795154571533,
"epoch": 2.4289473684210527,
"grad_norm": 0.0070864069275557995,
"learning_rate": 1e-06,
"loss": 0.1715,
"step": 923
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.27537743747234344,
"epoch": 2.431578947368421,
"grad_norm": 0.988322377204895,
"learning_rate": 1e-06,
"loss": 0.1607,
"step": 924
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11161.0,
"completions/mean_length": 1345.548828125,
"completions/mean_terminated_length": 605.9528198242188,
"completions/min_length": 237.0,
"completions/min_terminated_length": 237.0,
"entropy": 0.25172994285821915,
"epoch": 2.4342105263157894,
"frac_reward_zero_std": 0.4375,
"grad_norm": 17.042478561401367,
"learning_rate": 1e-06,
"loss": 0.1133,
"num_tokens": 287528290.0,
"reward": 0.8133134841918945,
"reward_std": 0.15900012850761414,
"rewards/progression_diversity/mean": -0.030958127230405807,
"rewards/progression_diversity/std": 0.12952972948551178,
"rewards/symbolic_reward_accuracy/mean": 0.8984375,
"rewards/symbolic_reward_accuracy/std": 0.30236753821372986,
"rewards/symbolic_reward_partial_score/mean": 0.9288737177848816,
"rewards/symbolic_reward_partial_score/std": 0.23390844464302063,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.050032615661621,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 187.0,
"sampling/sampling_logp_difference/mean": 0.4537726044654846,
"step": 925
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.25580619275569916,
"epoch": 2.4368421052631577,
"grad_norm": 0.008091673254966736,
"learning_rate": 1e-06,
"loss": 0.1041,
"step": 926
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.2634577751159668,
"epoch": 2.4394736842105265,
"grad_norm": 0.019041819497942924,
"learning_rate": 1e-06,
"loss": 0.1278,
"step": 927
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.2495371326804161,
"epoch": 2.442105263157895,
"grad_norm": 0.00901293195784092,
"learning_rate": 1e-06,
"loss": 0.1001,
"step": 928
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.095703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10453.0,
"completions/mean_length": 2109.220703125,
"completions/mean_terminated_length": 598.4989013671875,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"entropy": 0.26799434423446655,
"epoch": 2.444736842105263,
"frac_reward_zero_std": 0.21875,
"grad_norm": 14.625404357910156,
"learning_rate": 1e-06,
"loss": 0.1196,
"num_tokens": 289010675.0,
"reward": 0.7222642302513123,
"reward_std": 0.21547727286815643,
"rewards/progression_diversity/mean": -0.04896814748644829,
"rewards/progression_diversity/std": 0.15030057728290558,
"rewards/symbolic_reward_accuracy/mean": 0.783203125,
"rewards/symbolic_reward_accuracy/std": 0.4124660789966583,
"rewards/symbolic_reward_partial_score/mean": 0.8688151240348816,
"rewards/symbolic_reward_partial_score/std": 0.29783180356025696,
"rewards/tag_count_reward/mean": -0.078125,
"rewards/tag_count_reward/std": 0.26863065361976624,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0500848293304443,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 184.99998474121094,
"sampling/sampling_logp_difference/mean": 0.47443127632141113,
"step": 929
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.26960813999176025,
"epoch": 2.4473684210526314,
"grad_norm": 1.1039235591888428,
"learning_rate": 1e-06,
"loss": 0.1318,
"step": 930
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.26754334568977356,
"epoch": 2.45,
"grad_norm": 0.5074542164802551,
"learning_rate": 1e-06,
"loss": 0.126,
"step": 931
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.2775499224662781,
"epoch": 2.4526315789473685,
"grad_norm": 0.007839949801564217,
"learning_rate": 1e-06,
"loss": 0.1686,
"step": 932
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11585.0,
"completions/mean_length": 2094.462890625,
"completions/mean_terminated_length": 683.90771484375,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"entropy": 0.2719879597425461,
"epoch": 2.455263157894737,
"frac_reward_zero_std": 0.125,
"grad_norm": 22.145875930786133,
"learning_rate": 1e-06,
"loss": 0.1494,
"num_tokens": 290511872.0,
"reward": 0.7216647267341614,
"reward_std": 0.2439390867948532,
"rewards/progression_diversity/mean": -0.05032936483621597,
"rewards/progression_diversity/std": 0.15467680990695953,
"rewards/symbolic_reward_accuracy/mean": 0.78515625,
"rewards/symbolic_reward_accuracy/std": 0.4111155867576599,
"rewards/symbolic_reward_partial_score/mean": 0.8629556894302368,
"rewards/symbolic_reward_partial_score/std": 0.29924580454826355,
"rewards/tag_count_reward/mean": -0.078125,
"rewards/tag_count_reward/std": 0.26863065361976624,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0494955778121948,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 185.0,
"sampling/sampling_logp_difference/mean": 0.5632617473602295,
"step": 933
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2774350792169571,
"epoch": 2.457894736842105,
"grad_norm": 9.17587947845459,
"learning_rate": 1e-06,
"loss": 0.1804,
"step": 934
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.2561372071504593,
"epoch": 2.4605263157894735,
"grad_norm": 0.012559008784592152,
"learning_rate": 1e-06,
"loss": 0.1994,
"step": 935
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.27159862220287323,
"epoch": 2.463157894736842,
"grad_norm": 0.4640989601612091,
"learning_rate": 1e-06,
"loss": 0.1847,
"step": 936
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9495.0,
"completions/mean_length": 1562.392578125,
"completions/mean_terminated_length": 508.1359558105469,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"entropy": 0.24451758712530136,
"epoch": 2.4657894736842105,
"frac_reward_zero_std": 0.28125,
"grad_norm": 7.657289505004883,
"learning_rate": 1e-06,
"loss": 0.1219,
"num_tokens": 291718473.0,
"reward": 0.7819766998291016,
"reward_std": 0.22530367970466614,
"rewards/progression_diversity/mean": -0.03475029766559601,
"rewards/progression_diversity/std": 0.13393589854240417,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.9059244394302368,
"rewards/symbolic_reward_partial_score/std": 0.2685166001319885,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0499627590179443,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 183.99996948242188,
"sampling/sampling_logp_difference/mean": 0.395052433013916,
"step": 937
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.26470696926116943,
"epoch": 2.468421052631579,
"grad_norm": 14.9205961227417,
"learning_rate": 1e-06,
"loss": 0.1575,
"step": 938
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.25287581980228424,
"epoch": 2.4710526315789476,
"grad_norm": 0.30867093801498413,
"learning_rate": 1e-06,
"loss": 0.0946,
"step": 939
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.2676776349544525,
"epoch": 2.473684210526316,
"grad_norm": 0.011630654335021973,
"learning_rate": 1e-06,
"loss": 0.164,
"step": 940
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11695.0,
"completions/mean_length": 1605.857421875,
"completions/mean_terminated_length": 620.64794921875,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.2720135450363159,
"epoch": 2.4763157894736842,
"frac_reward_zero_std": 0.25,
"grad_norm": 22.873489379882812,
"learning_rate": 1e-06,
"loss": 0.1462,
"num_tokens": 292933632.0,
"reward": 0.7649558186531067,
"reward_std": 0.22175286710262299,
"rewards/progression_diversity/mean": -0.04250683635473251,
"rewards/progression_diversity/std": 0.15298283100128174,
"rewards/symbolic_reward_accuracy/mean": 0.833984375,
"rewards/symbolic_reward_accuracy/std": 0.3724585771560669,
"rewards/symbolic_reward_partial_score/mean": 0.90283203125,
"rewards/symbolic_reward_partial_score/std": 0.2671821117401123,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0548969507217407,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 186.0,
"sampling/sampling_logp_difference/mean": 0.43102023005485535,
"step": 941
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2624838799238205,
"epoch": 2.4789473684210526,
"grad_norm": 0.00974233727902174,
"learning_rate": 1e-06,
"loss": 0.0969,
"step": 942
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.2723797559738159,
"epoch": 2.481578947368421,
"grad_norm": 0.013580858707427979,
"learning_rate": 1e-06,
"loss": 0.1871,
"step": 943
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.27662527561187744,
"epoch": 2.4842105263157896,
"grad_norm": 0.8762795329093933,
"learning_rate": 1e-06,
"loss": 0.1239,
"step": 944
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.103515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12230.0,
"completions/mean_length": 2296.412109375,
"completions/mean_terminated_length": 669.74072265625,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"entropy": 0.2892232984304428,
"epoch": 2.486842105263158,
"frac_reward_zero_std": 0.125,
"grad_norm": 16.07674217224121,
"learning_rate": 1e-06,
"loss": 0.3197,
"num_tokens": 294500691.0,
"reward": 0.721849799156189,
"reward_std": 0.270133912563324,
"rewards/progression_diversity/mean": -0.061112575232982635,
"rewards/progression_diversity/std": 0.17030061781406403,
"rewards/symbolic_reward_accuracy/mean": 0.79296875,
"rewards/symbolic_reward_accuracy/std": 0.40557438135147095,
"rewards/symbolic_reward_partial_score/mean": 0.8528646230697632,
"rewards/symbolic_reward_partial_score/std": 0.327158123254776,
"rewards/tag_count_reward/mean": -0.091796875,
"rewards/tag_count_reward/std": 0.289021372795105,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.061535120010376,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 182.99998474121094,
"sampling/sampling_logp_difference/mean": 0.5773719549179077,
"step": 945
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.2695477306842804,
"epoch": 2.4894736842105263,
"grad_norm": 0.016710912808775902,
"learning_rate": 1e-06,
"loss": 0.1135,
"step": 946
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.2837250232696533,
"epoch": 2.4921052631578946,
"grad_norm": 2.493502616882324,
"learning_rate": 1e-06,
"loss": 0.1442,
"step": 947
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.484375,
"entropy": 0.28005383908748627,
"epoch": 2.4947368421052634,
"grad_norm": 1.3186397552490234,
"learning_rate": 1e-06,
"loss": 0.2604,
"step": 948
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12417.0,
"completions/mean_length": 2161.0078125,
"completions/mean_terminated_length": 621.7229614257812,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"entropy": 0.2822905331850052,
"epoch": 2.4973684210526317,
"frac_reward_zero_std": 0.28125,
"grad_norm": 14.119881629943848,
"learning_rate": 1e-06,
"loss": 0.1365,
"num_tokens": 296000247.0,
"reward": 0.7538953423500061,
"reward_std": 0.20081034302711487,
"rewards/progression_diversity/mean": -0.04991994798183441,
"rewards/progression_diversity/std": 0.15201660990715027,
"rewards/symbolic_reward_accuracy/mean": 0.83203125,
"rewards/symbolic_reward_accuracy/std": 0.374204158782959,
"rewards/symbolic_reward_partial_score/mean": 0.8811848759651184,
"rewards/symbolic_reward_partial_score/std": 0.3015502393245697,
"rewards/tag_count_reward/mean": -0.091796875,
"rewards/tag_count_reward/std": 0.289021372795105,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0665756464004517,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 187.9999542236328,
"sampling/sampling_logp_difference/mean": 0.5656195282936096,
"step": 949
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.28284990787506104,
"epoch": 2.5,
"grad_norm": 2.7521109580993652,
"learning_rate": 1e-06,
"loss": 0.118,
"step": 950
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.2816344350576401,
"epoch": 2.5026315789473683,
"grad_norm": 2.637901544570923,
"learning_rate": 1e-06,
"loss": 0.1765,
"step": 951
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2915927320718765,
"epoch": 2.5052631578947366,
"grad_norm": 0.009813666343688965,
"learning_rate": 1e-06,
"loss": 0.1735,
"step": 952
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10413.0,
"completions/mean_length": 1986.9765625,
"completions/mean_terminated_length": 565.8111572265625,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"entropy": 0.26933328807353973,
"epoch": 2.5078947368421054,
"frac_reward_zero_std": 0.3125,
"grad_norm": 31.982301712036133,
"learning_rate": 1e-06,
"loss": 0.1037,
"num_tokens": 297408875.0,
"reward": 0.7207555770874023,
"reward_std": 0.19293510913848877,
"rewards/progression_diversity/mean": -0.04358455538749695,
"rewards/progression_diversity/std": 0.14303399622440338,
"rewards/symbolic_reward_accuracy/mean": 0.783203125,
"rewards/symbolic_reward_accuracy/std": 0.4124660789966583,
"rewards/symbolic_reward_partial_score/mean": 0.8675130605697632,
"rewards/symbolic_reward_partial_score/std": 0.29670462012290955,
"rewards/tag_count_reward/mean": -0.08984375,
"rewards/tag_count_reward/std": 0.2862374484539032,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.059614658355713,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 190.99974060058594,
"sampling/sampling_logp_difference/mean": 0.5123655200004578,
"step": 953
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2840705066919327,
"epoch": 2.5105263157894737,
"grad_norm": 9.22028923034668,
"learning_rate": 1e-06,
"loss": 0.2415,
"step": 954
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2771739959716797,
"epoch": 2.513157894736842,
"grad_norm": 1.641637921333313,
"learning_rate": 1e-06,
"loss": 0.1161,
"step": 955
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.27851031720638275,
"epoch": 2.515789473684211,
"grad_norm": 2.4652910232543945,
"learning_rate": 1e-06,
"loss": 0.1288,
"step": 956
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13128.0,
"completions/mean_length": 1963.44921875,
"completions/mean_terminated_length": 674.8042602539062,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.2786627262830734,
"epoch": 2.518421052631579,
"frac_reward_zero_std": 0.34375,
"grad_norm": 18.040977478027344,
"learning_rate": 1e-06,
"loss": 0.0985,
"num_tokens": 298796273.0,
"reward": 0.7595969438552856,
"reward_std": 0.18789124488830566,
"rewards/progression_diversity/mean": -0.041288819164037704,
"rewards/progression_diversity/std": 0.13274167478084564,
"rewards/symbolic_reward_accuracy/mean": 0.837890625,
"rewards/symbolic_reward_accuracy/std": 0.3689115643501282,
"rewards/symbolic_reward_partial_score/mean": 0.88623046875,
"rewards/symbolic_reward_partial_score/std": 0.29010042548179626,
"rewards/tag_count_reward/mean": -0.0859375,
"rewards/tag_count_reward/std": 0.28054583072662354,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.061213493347168,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 190.0,
"sampling/sampling_logp_difference/mean": 0.4693317413330078,
"step": 957
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2739100754261017,
"epoch": 2.5210526315789474,
"grad_norm": 0.009779985062777996,
"learning_rate": 1e-06,
"loss": 0.1521,
"step": 958
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.295116662979126,
"epoch": 2.5236842105263158,
"grad_norm": 0.010075357742607594,
"learning_rate": 1e-06,
"loss": 0.1778,
"step": 959
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.27153341472148895,
"epoch": 2.526315789473684,
"grad_norm": 0.01223987527191639,
"learning_rate": 1e-06,
"loss": 0.1236,
"step": 960
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14244.0,
"completions/mean_length": 2309.912109375,
"completions/mean_terminated_length": 786.742431640625,
"completions/min_length": 260.0,
"completions/min_terminated_length": 260.0,
"entropy": 0.30197782814502716,
"epoch": 2.5289473684210524,
"frac_reward_zero_std": 0.15625,
"grad_norm": 25.759937286376953,
"learning_rate": 1e-06,
"loss": 0.1054,
"num_tokens": 300395268.0,
"reward": 0.6984215378761292,
"reward_std": 0.22170159220695496,
"rewards/progression_diversity/mean": -0.0504293292760849,
"rewards/progression_diversity/std": 0.14640994369983673,
"rewards/symbolic_reward_accuracy/mean": 0.755859375,
"rewards/symbolic_reward_accuracy/std": 0.42999663949012756,
"rewards/symbolic_reward_partial_score/mean": 0.8512369394302368,
"rewards/symbolic_reward_partial_score/std": 0.3122048079967499,
"rewards/tag_count_reward/mean": -0.099609375,
"rewards/tag_count_reward/std": 0.29977133870124817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0820285081863403,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 189.9932861328125,
"sampling/sampling_logp_difference/mean": 0.6296501159667969,
"step": 961
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.484375,
"entropy": 0.31053292751312256,
"epoch": 2.531578947368421,
"grad_norm": 0.010956598445773125,
"learning_rate": 1e-06,
"loss": 0.197,
"step": 962
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.3268131613731384,
"epoch": 2.5342105263157895,
"grad_norm": 4.867886543273926,
"learning_rate": 1e-06,
"loss": 0.1752,
"step": 963
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.29976852238178253,
"epoch": 2.536842105263158,
"grad_norm": 0.28594204783439636,
"learning_rate": 1e-06,
"loss": 0.1525,
"step": 964
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14171.0,
"completions/mean_length": 1403.908203125,
"completions/mean_terminated_length": 667.1823120117188,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.3206019401550293,
"epoch": 2.5394736842105265,
"frac_reward_zero_std": 0.34375,
"grad_norm": 21.9819393157959,
"learning_rate": 1e-06,
"loss": 0.1878,
"num_tokens": 301511541.0,
"reward": 0.8047443628311157,
"reward_std": 0.16195279359817505,
"rewards/progression_diversity/mean": -0.028497815132141113,
"rewards/progression_diversity/std": 0.1169213354587555,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9347330331802368,
"rewards/symbolic_reward_partial_score/std": 0.21818473935127258,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0805866718292236,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 188.99998474121094,
"sampling/sampling_logp_difference/mean": 0.5423262119293213,
"step": 965
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.28471747040748596,
"epoch": 2.542105263157895,
"grad_norm": 0.39148885011672974,
"learning_rate": 1e-06,
"loss": 0.0897,
"step": 966
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.27406057715415955,
"epoch": 2.544736842105263,
"grad_norm": 0.004555261693894863,
"learning_rate": 1e-06,
"loss": 0.0543,
"step": 967
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.29411862790584564,
"epoch": 2.5473684210526315,
"grad_norm": 0.011435529217123985,
"learning_rate": 1e-06,
"loss": 0.1264,
"step": 968
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13752.0,
"completions/mean_length": 1564.11328125,
"completions/mean_terminated_length": 641.7137451171875,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"entropy": 0.31351859867572784,
"epoch": 2.55,
"frac_reward_zero_std": 0.375,
"grad_norm": 25.21331787109375,
"learning_rate": 1e-06,
"loss": 0.1539,
"num_tokens": 302711503.0,
"reward": 0.7893426418304443,
"reward_std": 0.18001574277877808,
"rewards/progression_diversity/mean": -0.03058181144297123,
"rewards/progression_diversity/std": 0.11552340537309647,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.9108072519302368,
"rewards/symbolic_reward_partial_score/std": 0.25809013843536377,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0829228162765503,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 189.0,
"sampling/sampling_logp_difference/mean": 0.5267312526702881,
"step": 969
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2877921462059021,
"epoch": 2.5526315789473686,
"grad_norm": 0.15190419554710388,
"learning_rate": 1e-06,
"loss": 0.0471,
"step": 970
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.30551964044570923,
"epoch": 2.555263157894737,
"grad_norm": 0.005192040465772152,
"learning_rate": 1e-06,
"loss": 0.1395,
"step": 971
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.29702240228652954,
"epoch": 2.557894736842105,
"grad_norm": 0.016318701207637787,
"learning_rate": 1e-06,
"loss": 0.0983,
"step": 972
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11243.0,
"completions/mean_length": 1504.9765625,
"completions/mean_terminated_length": 578.8963012695312,
"completions/min_length": 247.0,
"completions/min_terminated_length": 247.0,
"entropy": 0.27735981345176697,
"epoch": 2.5605263157894735,
"frac_reward_zero_std": 0.34375,
"grad_norm": 5.76010274887085,
"learning_rate": 1e-06,
"loss": 0.0696,
"num_tokens": 303875171.0,
"reward": 0.7819315195083618,
"reward_std": 0.1941755712032318,
"rewards/progression_diversity/mean": -0.029511984437704086,
"rewards/progression_diversity/std": 0.11651825904846191,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.908203125,
"rewards/symbolic_reward_partial_score/std": 0.25990694761276245,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0842504501342773,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 192.0,
"sampling/sampling_logp_difference/mean": 0.5295515060424805,
"step": 973
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.33016660809516907,
"epoch": 2.5631578947368423,
"grad_norm": 0.0073794652707874775,
"learning_rate": 1e-06,
"loss": 0.201,
"step": 974
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.287723571062088,
"epoch": 2.5657894736842106,
"grad_norm": 0.01062384806573391,
"learning_rate": 1e-06,
"loss": 0.139,
"step": 975
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.2862458676099777,
"epoch": 2.568421052631579,
"grad_norm": 0.01843968592584133,
"learning_rate": 1e-06,
"loss": 0.0968,
"step": 976
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13225.0,
"completions/mean_length": 1550.58203125,
"completions/mean_terminated_length": 627.3402709960938,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"entropy": 0.30991949141025543,
"epoch": 2.5710526315789473,
"frac_reward_zero_std": 0.3125,
"grad_norm": 35.655460357666016,
"learning_rate": 1e-06,
"loss": 0.1447,
"num_tokens": 305055693.0,
"reward": 0.7691850662231445,
"reward_std": 0.20995613932609558,
"rewards/progression_diversity/mean": -0.02974330447614193,
"rewards/progression_diversity/std": 0.1124248057603836,
"rewards/symbolic_reward_accuracy/mean": 0.84375,
"rewards/symbolic_reward_accuracy/std": 0.36344730854034424,
"rewards/symbolic_reward_partial_score/mean": 0.8976237177848816,
"rewards/symbolic_reward_partial_score/std": 0.27150553464889526,
"rewards/tag_count_reward/mean": -0.060546875,
"rewards/tag_count_reward/std": 0.2387305200099945,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0818887948989868,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 199.03089904785156,
"sampling/sampling_logp_difference/mean": 0.4938344359397888,
"step": 977
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2864755541086197,
"epoch": 2.5736842105263156,
"grad_norm": 0.02253652736544609,
"learning_rate": 1e-06,
"loss": 0.0692,
"step": 978
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.28972767293453217,
"epoch": 2.5763157894736843,
"grad_norm": 0.007194779813289642,
"learning_rate": 1e-06,
"loss": 0.1079,
"step": 979
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.30334651470184326,
"epoch": 2.5789473684210527,
"grad_norm": 0.007408217992633581,
"learning_rate": 1e-06,
"loss": 0.1557,
"step": 980
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13332.0,
"completions/mean_length": 1399.427734375,
"completions/mean_terminated_length": 565.2350463867188,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"entropy": 0.2880914658308029,
"epoch": 2.581578947368421,
"frac_reward_zero_std": 0.3125,
"grad_norm": 79.71167755126953,
"learning_rate": 1e-06,
"loss": 0.1494,
"num_tokens": 306174184.0,
"reward": 0.8039476275444031,
"reward_std": 0.21047255396842957,
"rewards/progression_diversity/mean": -0.025160329416394234,
"rewards/progression_diversity/std": 0.10529939085245132,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9241536855697632,
"rewards/symbolic_reward_partial_score/std": 0.23598654568195343,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0879991054534912,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 199.0,
"sampling/sampling_logp_difference/mean": 0.5080787539482117,
"step": 981
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2747099846601486,
"epoch": 2.5842105263157897,
"grad_norm": 0.09463394433259964,
"learning_rate": 1e-06,
"loss": 0.0628,
"step": 982
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.29164865612983704,
"epoch": 2.586842105263158,
"grad_norm": 0.016119860112667084,
"learning_rate": 1e-06,
"loss": 0.1221,
"step": 983
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.3027169704437256,
"epoch": 2.5894736842105264,
"grad_norm": 0.01110443938523531,
"learning_rate": 1e-06,
"loss": 0.2168,
"step": 984
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.068359375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14648.0,
"completions/mean_length": 1692.560546875,
"completions/mean_terminated_length": 614.5723266601562,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"entropy": 0.30789491534233093,
"epoch": 2.5921052631578947,
"frac_reward_zero_std": 0.1875,
"grad_norm": 100.55760192871094,
"learning_rate": 1e-06,
"loss": 0.196,
"num_tokens": 307458599.0,
"reward": 0.7546229362487793,
"reward_std": 0.2402782291173935,
"rewards/progression_diversity/mean": -0.03575975075364113,
"rewards/progression_diversity/std": 0.12710198760032654,
"rewards/symbolic_reward_accuracy/mean": 0.8203125,
"rewards/symbolic_reward_accuracy/std": 0.38430243730545044,
"rewards/symbolic_reward_partial_score/mean": 0.8974609375,
"rewards/symbolic_reward_partial_score/std": 0.26585423946380615,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0941245555877686,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 200.0,
"sampling/sampling_logp_difference/mean": 0.5682839751243591,
"step": 985
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.3354620784521103,
"epoch": 2.594736842105263,
"grad_norm": 0.8639695644378662,
"learning_rate": 1e-06,
"loss": 0.2717,
"step": 986
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.27841542661190033,
"epoch": 2.5973684210526313,
"grad_norm": 0.023379117250442505,
"learning_rate": 1e-06,
"loss": 0.105,
"step": 987
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2825475037097931,
"epoch": 2.6,
"grad_norm": 0.06907982379198074,
"learning_rate": 1e-06,
"loss": 0.0687,
"step": 988
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.060546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13308.0,
"completions/mean_length": 1544.58203125,
"completions/mean_terminated_length": 588.1954345703125,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"entropy": 0.3159325271844864,
"epoch": 2.6026315789473684,
"frac_reward_zero_std": 0.375,
"grad_norm": 10.981948852539062,
"learning_rate": 1e-06,
"loss": 0.108,
"num_tokens": 308619185.0,
"reward": 0.7951602935791016,
"reward_std": 0.1814492791891098,
"rewards/progression_diversity/mean": -0.03475700318813324,
"rewards/progression_diversity/std": 0.13058389723300934,
"rewards/symbolic_reward_accuracy/mean": 0.876953125,
"rewards/symbolic_reward_accuracy/std": 0.32881227135658264,
"rewards/symbolic_reward_partial_score/mean": 0.9205728769302368,
"rewards/symbolic_reward_partial_score/std": 0.24516622722148895,
"rewards/tag_count_reward/mean": -0.068359375,
"rewards/tag_count_reward/std": 0.25260838866233826,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.087169885635376,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 207.0,
"sampling/sampling_logp_difference/mean": 0.5363781452178955,
"step": 989
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.292761892080307,
"epoch": 2.6052631578947367,
"grad_norm": 0.7690657377243042,
"learning_rate": 1e-06,
"loss": 0.0832,
"step": 990
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.30266323685646057,
"epoch": 2.6078947368421055,
"grad_norm": 0.011311556212604046,
"learning_rate": 1e-06,
"loss": 0.1176,
"step": 991
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.30019472539424896,
"epoch": 2.610526315789474,
"grad_norm": 0.01206042617559433,
"learning_rate": 1e-06,
"loss": 0.1639,
"step": 992
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14076.0,
"completions/mean_length": 1671.501953125,
"completions/mean_terminated_length": 657.9060668945312,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"entropy": 0.3192198872566223,
"epoch": 2.613157894736842,
"frac_reward_zero_std": 0.25,
"grad_norm": 54.62965393066406,
"learning_rate": 1e-06,
"loss": 0.2534,
"num_tokens": 309873970.0,
"reward": 0.763299822807312,
"reward_std": 0.21089032292366028,
"rewards/progression_diversity/mean": -0.04208873584866524,
"rewards/progression_diversity/std": 0.14815421402454376,
"rewards/symbolic_reward_accuracy/mean": 0.833984375,
"rewards/symbolic_reward_accuracy/std": 0.3724585771560669,
"rewards/symbolic_reward_partial_score/mean": 0.9012044072151184,
"rewards/symbolic_reward_partial_score/std": 0.26881706714630127,
"rewards/tag_count_reward/mean": -0.0703125,
"rewards/tag_count_reward/std": 0.25592297315597534,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1019631624221802,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 211.0,
"sampling/sampling_logp_difference/mean": 0.8787015676498413,
"step": 993
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.3022729456424713,
"epoch": 2.6157894736842104,
"grad_norm": 0.012045558542013168,
"learning_rate": 1e-06,
"loss": 0.1114,
"step": 994
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.298090398311615,
"epoch": 2.6184210526315788,
"grad_norm": 0.007539027836173773,
"learning_rate": 1e-06,
"loss": 0.1346,
"step": 995
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.2837032973766327,
"epoch": 2.6210526315789475,
"grad_norm": 0.5139919519424438,
"learning_rate": 1e-06,
"loss": 0.1695,
"step": 996
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.095703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13815.0,
"completions/mean_length": 2134.458984375,
"completions/mean_terminated_length": 626.408203125,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"entropy": 0.28592349588871,
"epoch": 2.623684210526316,
"frac_reward_zero_std": 0.125,
"grad_norm": 19.418561935424805,
"learning_rate": 1e-06,
"loss": 0.0376,
"num_tokens": 311386141.0,
"reward": 0.7048972845077515,
"reward_std": 0.2682965099811554,
"rewards/progression_diversity/mean": -0.05226554721593857,
"rewards/progression_diversity/std": 0.15609407424926758,
"rewards/symbolic_reward_accuracy/mean": 0.7578125,
"rewards/symbolic_reward_accuracy/std": 0.42882615327835083,
"rewards/symbolic_reward_partial_score/mean": 0.8663736581802368,
"rewards/symbolic_reward_partial_score/std": 0.2934032678604126,
"rewards/tag_count_reward/mean": -0.091796875,
"rewards/tag_count_reward/std": 0.289021372795105,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.119614601135254,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 214.0,
"sampling/sampling_logp_difference/mean": 1.162545919418335,
"step": 997
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.32562409341335297,
"epoch": 2.626315789473684,
"grad_norm": 0.011620646342635155,
"learning_rate": 1e-06,
"loss": 0.2374,
"step": 998
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3074010759592056,
"epoch": 2.6289473684210525,
"grad_norm": 33.348323822021484,
"learning_rate": 1e-06,
"loss": 0.1771,
"step": 999
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.2109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.546875,
"entropy": 0.31934235990047455,
"epoch": 2.6315789473684212,
"grad_norm": 0.009514860808849335,
"learning_rate": 1e-06,
"loss": 0.315,
"step": 1000
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.107421875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13780.0,
"completions/mean_length": 2376.53125,
"completions/mean_terminated_length": 690.7308349609375,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 0.30572909116744995,
"epoch": 2.6342105263157896,
"frac_reward_zero_std": 0.1875,
"grad_norm": 94.71639251708984,
"learning_rate": 1e-06,
"loss": 0.2104,
"num_tokens": 312988045.0,
"reward": 0.7043373584747314,
"reward_std": 0.23734444379806519,
"rewards/progression_diversity/mean": -0.05454763025045395,
"rewards/progression_diversity/std": 0.15527451038360596,
"rewards/symbolic_reward_accuracy/mean": 0.76953125,
"rewards/symbolic_reward_accuracy/std": 0.42154473066329956,
"rewards/symbolic_reward_partial_score/mean": 0.8483072519302368,
"rewards/symbolic_reward_partial_score/std": 0.3243545591831207,
"rewards/tag_count_reward/mean": -0.11328125,
"rewards/tag_count_reward/std": 0.3172462284564972,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1103978157043457,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 215.0,
"sampling/sampling_logp_difference/mean": 0.8874210119247437,
"step": 1001
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.29766781628131866,
"epoch": 2.636842105263158,
"grad_norm": 1.0701483488082886,
"learning_rate": 1e-06,
"loss": 0.1461,
"step": 1002
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.29889827966690063,
"epoch": 2.639473684210526,
"grad_norm": 0.014587471261620522,
"learning_rate": 1e-06,
"loss": 0.1481,
"step": 1003
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.33510279655456543,
"epoch": 2.6421052631578945,
"grad_norm": 0.6961768269538879,
"learning_rate": 1e-06,
"loss": 0.2699,
"step": 1004
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13484.0,
"completions/mean_length": 1625.13671875,
"completions/mean_terminated_length": 575.3430786132812,
"completions/min_length": 253.0,
"completions/min_terminated_length": 253.0,
"entropy": 0.29443803429603577,
"epoch": 2.6447368421052633,
"frac_reward_zero_std": 0.21875,
"grad_norm": 39.419071197509766,
"learning_rate": 1e-06,
"loss": 0.1312,
"num_tokens": 314233267.0,
"reward": 0.7752651572227478,
"reward_std": 0.22436900436878204,
"rewards/progression_diversity/mean": -0.0369657427072525,
"rewards/progression_diversity/std": 0.13464143872261047,
"rewards/symbolic_reward_accuracy/mean": 0.84765625,
"rewards/symbolic_reward_accuracy/std": 0.35970520973205566,
"rewards/symbolic_reward_partial_score/mean": 0.91162109375,
"rewards/symbolic_reward_partial_score/std": 0.2517907917499542,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1153823137283325,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 214.0,
"sampling/sampling_logp_difference/mean": 0.7468029260635376,
"step": 1005
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.31393955647945404,
"epoch": 2.6473684210526316,
"grad_norm": 49.80437469482422,
"learning_rate": 1e-06,
"loss": 0.2599,
"step": 1006
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.2889316976070404,
"epoch": 2.65,
"grad_norm": 2.7875256538391113,
"learning_rate": 1e-06,
"loss": 0.0751,
"step": 1007
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.305808961391449,
"epoch": 2.6526315789473687,
"grad_norm": 3.0765891075134277,
"learning_rate": 1e-06,
"loss": 0.2263,
"step": 1008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14044.0,
"completions/mean_length": 1598.720703125,
"completions/mean_terminated_length": 678.4751586914062,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.30450139939785004,
"epoch": 2.655263157894737,
"frac_reward_zero_std": 0.125,
"grad_norm": 31.50531578063965,
"learning_rate": 1e-06,
"loss": 0.1628,
"num_tokens": 315446116.0,
"reward": 0.7807782292366028,
"reward_std": 0.25463420152664185,
"rewards/progression_diversity/mean": -0.037412114441394806,
"rewards/progression_diversity/std": 0.1364642083644867,
"rewards/symbolic_reward_accuracy/mean": 0.86328125,
"rewards/symbolic_reward_accuracy/std": 0.3438861668109894,
"rewards/symbolic_reward_partial_score/mean": 0.8994140625,
"rewards/symbolic_reward_partial_score/std": 0.2765096426010132,
"rewards/tag_count_reward/mean": -0.06640625,
"rewards/tag_count_reward/std": 0.2492343932390213,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1178048849105835,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 216.0,
"sampling/sampling_logp_difference/mean": 0.7162982225418091,
"step": 1009
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.28954601287841797,
"epoch": 2.6578947368421053,
"grad_norm": 10.927779197692871,
"learning_rate": 1e-06,
"loss": 0.1193,
"step": 1010
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.33044156432151794,
"epoch": 2.6605263157894736,
"grad_norm": 0.021338341757655144,
"learning_rate": 1e-06,
"loss": 0.3381,
"step": 1011
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.287820965051651,
"epoch": 2.663157894736842,
"grad_norm": 0.009080777876079082,
"learning_rate": 1e-06,
"loss": 0.1125,
"step": 1012
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.072265625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13928.0,
"completions/mean_length": 1759.529296875,
"completions/mean_terminated_length": 620.3599853515625,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"entropy": 0.3139393925666809,
"epoch": 2.6657894736842103,
"frac_reward_zero_std": 0.28125,
"grad_norm": 21.068523406982422,
"learning_rate": 1e-06,
"loss": 0.1538,
"num_tokens": 316735123.0,
"reward": 0.7530990839004517,
"reward_std": 0.20721742510795593,
"rewards/progression_diversity/mean": -0.036777839064598083,
"rewards/progression_diversity/std": 0.12973876297473907,
"rewards/symbolic_reward_accuracy/mean": 0.82421875,
"rewards/symbolic_reward_accuracy/std": 0.3810062110424042,
"rewards/symbolic_reward_partial_score/mean": 0.88720703125,
"rewards/symbolic_reward_partial_score/std": 0.28543245792388916,
"rewards/tag_count_reward/mean": -0.072265625,
"rewards/tag_count_reward/std": 0.2591804563999176,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1145870685577393,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 218.0,
"sampling/sampling_logp_difference/mean": 0.6381403803825378,
"step": 1013
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.2958460748195648,
"epoch": 2.668421052631579,
"grad_norm": 0.33650216460227966,
"learning_rate": 1e-06,
"loss": 0.0614,
"step": 1014
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3110879957675934,
"epoch": 2.6710526315789473,
"grad_norm": 0.006965796463191509,
"learning_rate": 1e-06,
"loss": 0.1327,
"step": 1015
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.31926093995571136,
"epoch": 2.6736842105263157,
"grad_norm": 0.009219714440405369,
"learning_rate": 1e-06,
"loss": 0.17,
"step": 1016
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15009.0,
"completions/mean_length": 1719.177734375,
"completions/mean_terminated_length": 676.0731811523438,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.3213798254728317,
"epoch": 2.6763157894736844,
"frac_reward_zero_std": 0.09375,
"grad_norm": 36.93714904785156,
"learning_rate": 1e-06,
"loss": 0.1665,
"num_tokens": 317986126.0,
"reward": 0.7775270938873291,
"reward_std": 0.24769052863121033,
"rewards/progression_diversity/mean": -0.03537972271442413,
"rewards/progression_diversity/std": 0.1272072046995163,
"rewards/symbolic_reward_accuracy/mean": 0.853515625,
"rewards/symbolic_reward_accuracy/std": 0.35393697023391724,
"rewards/symbolic_reward_partial_score/mean": 0.90673828125,
"rewards/symbolic_reward_partial_score/std": 0.263979971408844,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1412655115127563,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 219.0,
"sampling/sampling_logp_difference/mean": 0.8786369562149048,
"step": 1017
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3090389519929886,
"epoch": 2.6789473684210527,
"grad_norm": 69.12992095947266,
"learning_rate": 1e-06,
"loss": 0.1427,
"step": 1018
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.333890438079834,
"epoch": 2.681578947368421,
"grad_norm": 0.026694627478718758,
"learning_rate": 1e-06,
"loss": 0.2929,
"step": 1019
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.316163033246994,
"epoch": 2.6842105263157894,
"grad_norm": 0.014393487945199013,
"learning_rate": 1e-06,
"loss": 0.1923,
"step": 1020
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15286.0,
"completions/mean_length": 3114.5078125,
"completions/mean_terminated_length": 729.668212890625,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"entropy": 0.36883601546287537,
"epoch": 2.6868421052631577,
"frac_reward_zero_std": 0.125,
"grad_norm": 56.34549331665039,
"learning_rate": 1e-06,
"loss": 0.3005,
"num_tokens": 320001746.0,
"reward": 0.7100275158882141,
"reward_std": 0.24902528524398804,
"rewards/progression_diversity/mean": -0.07147189974784851,
"rewards/progression_diversity/std": 0.16751477122306824,
"rewards/symbolic_reward_accuracy/mean": 0.7890625,
"rewards/symbolic_reward_accuracy/std": 0.4083731174468994,
"rewards/symbolic_reward_partial_score/mean": 0.84375,
"rewards/symbolic_reward_partial_score/std": 0.33665984869003296,
"rewards/tag_count_reward/mean": -0.158203125,
"rewards/tag_count_reward/std": 0.36528825759887695,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1642587184906006,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 220.0,
"sampling/sampling_logp_difference/mean": 1.0265074968338013,
"step": 1021
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.33186276257038116,
"epoch": 2.6894736842105265,
"grad_norm": 2.4869983196258545,
"learning_rate": 1e-06,
"loss": 0.2048,
"step": 1022
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.3606336712837219,
"epoch": 2.692105263157895,
"grad_norm": 1.9981337785720825,
"learning_rate": 1e-06,
"loss": 0.2636,
"step": 1023
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.35093724727630615,
"epoch": 2.694736842105263,
"grad_norm": 1.4287171363830566,
"learning_rate": 1e-06,
"loss": 0.2628,
"step": 1024
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14680.0,
"completions/mean_length": 2566.037109375,
"completions/mean_terminated_length": 731.7942504882812,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"entropy": 0.3709093779325485,
"epoch": 2.6973684210526314,
"frac_reward_zero_std": 0.09375,
"grad_norm": 38.17184829711914,
"learning_rate": 1e-06,
"loss": 0.4175,
"num_tokens": 321699173.0,
"reward": 0.709370493888855,
"reward_std": 0.2614933252334595,
"rewards/progression_diversity/mean": -0.059050630778074265,
"rewards/progression_diversity/std": 0.1578987091779709,
"rewards/symbolic_reward_accuracy/mean": 0.783203125,
"rewards/symbolic_reward_accuracy/std": 0.4124660789966583,
"rewards/symbolic_reward_partial_score/mean": 0.83984375,
"rewards/symbolic_reward_partial_score/std": 0.3304028809070587,
"rewards/tag_count_reward/mean": -0.119140625,
"rewards/tag_count_reward/std": 0.32427072525024414,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1661169528961182,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 220.0,
"sampling/sampling_logp_difference/mean": 1.1539242267608643,
"step": 1025
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.40625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.3150624781847,
"epoch": 2.7,
"grad_norm": 1.1932648420333862,
"learning_rate": 1e-06,
"loss": 0.1208,
"step": 1026
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.35171347856521606,
"epoch": 2.7026315789473685,
"grad_norm": 0.0064244456589221954,
"learning_rate": 1e-06,
"loss": 0.3091,
"step": 1027
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.53125,
"entropy": 0.32932211458683014,
"epoch": 2.705263157894737,
"grad_norm": 0.20103150606155396,
"learning_rate": 1e-06,
"loss": 0.1512,
"step": 1028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14449.0,
"completions/mean_length": 2274.8984375,
"completions/mean_terminated_length": 714.021728515625,
"completions/min_length": 237.0,
"completions/min_terminated_length": 237.0,
"entropy": 0.3005029112100601,
"epoch": 2.707894736842105,
"frac_reward_zero_std": 0.1875,
"grad_norm": 63.158145904541016,
"learning_rate": 1e-06,
"loss": 0.1651,
"num_tokens": 323267889.0,
"reward": 0.7657883167266846,
"reward_std": 0.2607175409793854,
"rewards/progression_diversity/mean": -0.05203522741794586,
"rewards/progression_diversity/std": 0.15054935216903687,
"rewards/symbolic_reward_accuracy/mean": 0.849609375,
"rewards/symbolic_reward_accuracy/std": 0.35780346393585205,
"rewards/symbolic_reward_partial_score/mean": 0.8870442509651184,
"rewards/symbolic_reward_partial_score/std": 0.29508423805236816,
"rewards/tag_count_reward/mean": -0.095703125,
"rewards/tag_count_reward/std": 0.2944713830947876,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.148714303970337,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 221.0,
"sampling/sampling_logp_difference/mean": 1.0465000867843628,
"step": 1029
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.3231324404478073,
"epoch": 2.7105263157894735,
"grad_norm": 0.05354348570108414,
"learning_rate": 1e-06,
"loss": 0.175,
"step": 1030
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3530917465686798,
"epoch": 2.713157894736842,
"grad_norm": 0.014254847541451454,
"learning_rate": 1e-06,
"loss": 0.2848,
"step": 1031
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3636767566204071,
"epoch": 2.7157894736842105,
"grad_norm": 0.34099018573760986,
"learning_rate": 1e-06,
"loss": 0.3258,
"step": 1032
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15208.0,
"completions/mean_length": 3406.103515625,
"completions/mean_terminated_length": 859.0396728515625,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"entropy": 0.3606864959001541,
"epoch": 2.718421052631579,
"frac_reward_zero_std": 0.0,
"grad_norm": 110.6368637084961,
"learning_rate": 1e-06,
"loss": 0.2805,
"num_tokens": 325423462.0,
"reward": 0.6178954243659973,
"reward_std": 0.3361561894416809,
"rewards/progression_diversity/mean": -0.08546093106269836,
"rewards/progression_diversity/std": 0.18625648319721222,
"rewards/symbolic_reward_accuracy/mean": 0.666015625,
"rewards/symbolic_reward_accuracy/std": 0.47209542989730835,
"rewards/symbolic_reward_partial_score/mean": 0.7877604365348816,
"rewards/symbolic_reward_partial_score/std": 0.3652539551258087,
"rewards/tag_count_reward/mean": -0.171875,
"rewards/tag_count_reward/std": 0.3776407241821289,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1710354089736938,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 224.0,
"sampling/sampling_logp_difference/mean": 1.457003116607666,
"step": 1033
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.36586907505989075,
"epoch": 2.7210526315789476,
"grad_norm": 47.91674041748047,
"learning_rate": 1e-06,
"loss": 0.3584,
"step": 1034
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.2421875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.35853834450244904,
"epoch": 2.723684210526316,
"grad_norm": 1.4703402519226074,
"learning_rate": 1e-06,
"loss": 0.2924,
"step": 1035
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.3385375142097473,
"epoch": 2.7263157894736842,
"grad_norm": 10.245598793029785,
"learning_rate": 1e-06,
"loss": 0.1748,
"step": 1036
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10898.0,
"completions/mean_length": 2200.62109375,
"completions/mean_terminated_length": 597.2825927734375,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"entropy": 0.30844029784202576,
"epoch": 2.7289473684210526,
"frac_reward_zero_std": 0.21875,
"grad_norm": 21.34285545349121,
"learning_rate": 1e-06,
"loss": 0.0902,
"num_tokens": 326975684.0,
"reward": 0.7785071730613708,
"reward_std": 0.2188257873058319,
"rewards/progression_diversity/mean": -0.05456160753965378,
"rewards/progression_diversity/std": 0.16060733795166016,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.9073892831802368,
"rewards/symbolic_reward_partial_score/std": 0.2701605260372162,
"rewards/tag_count_reward/mean": -0.099609375,
"rewards/tag_count_reward/std": 0.29977133870124817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1437357664108276,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 223.0,
"sampling/sampling_logp_difference/mean": 1.1044782400131226,
"step": 1037
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.35305924713611603,
"epoch": 2.731578947368421,
"grad_norm": 2.7072434425354004,
"learning_rate": 1e-06,
"loss": 0.3355,
"step": 1038
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.31338661909103394,
"epoch": 2.734210526315789,
"grad_norm": 16.57544708251953,
"learning_rate": 1e-06,
"loss": 0.1466,
"step": 1039
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3223544955253601,
"epoch": 2.736842105263158,
"grad_norm": 0.008024279028177261,
"learning_rate": 1e-06,
"loss": 0.2434,
"step": 1040
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14168.0,
"completions/mean_length": 2421.57421875,
"completions/mean_terminated_length": 637.8281860351562,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"entropy": 0.3547728508710861,
"epoch": 2.7394736842105263,
"frac_reward_zero_std": 0.09375,
"grad_norm": 86.12596130371094,
"learning_rate": 1e-06,
"loss": 0.3437,
"num_tokens": 328613322.0,
"reward": 0.7703942060470581,
"reward_std": 0.2505676746368408,
"rewards/progression_diversity/mean": -0.06019480898976326,
"rewards/progression_diversity/std": 0.1668085753917694,
"rewards/symbolic_reward_accuracy/mean": 0.853515625,
"rewards/symbolic_reward_accuracy/std": 0.35393697023391724,
"rewards/symbolic_reward_partial_score/mean": 0.9000651240348816,
"rewards/symbolic_reward_partial_score/std": 0.27447789907455444,
"rewards/tag_count_reward/mean": -0.111328125,
"rewards/tag_count_reward/std": 0.31484565138816833,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.165305256843567,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 227.0,
"sampling/sampling_logp_difference/mean": 1.1906356811523438,
"step": 1041
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4296875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.3167977035045624,
"epoch": 2.7421052631578946,
"grad_norm": 6.371710300445557,
"learning_rate": 1e-06,
"loss": 0.1303,
"step": 1042
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5390625,
"entropy": 0.3315960764884949,
"epoch": 2.7447368421052634,
"grad_norm": 22.913915634155273,
"learning_rate": 1e-06,
"loss": 0.2824,
"step": 1043
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.3117201626300812,
"epoch": 2.7473684210526317,
"grad_norm": 5.115964412689209,
"learning_rate": 1e-06,
"loss": 0.253,
"step": 1044
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13730.0,
"completions/mean_length": 2225.83984375,
"completions/mean_terminated_length": 659.5358276367188,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"entropy": 0.31773585081100464,
"epoch": 2.75,
"frac_reward_zero_std": 0.1875,
"grad_norm": 119.56202697753906,
"learning_rate": 1e-06,
"loss": 0.1282,
"num_tokens": 330152088.0,
"reward": 0.742752194404602,
"reward_std": 0.24866357445716858,
"rewards/progression_diversity/mean": -0.055839426815509796,
"rewards/progression_diversity/std": 0.16208583116531372,
"rewards/symbolic_reward_accuracy/mean": 0.814453125,
"rewards/symbolic_reward_accuracy/std": 0.38912075757980347,
"rewards/symbolic_reward_partial_score/mean": 0.88134765625,
"rewards/symbolic_reward_partial_score/std": 0.2882753908634186,
"rewards/tag_count_reward/mean": -0.09765625,
"rewards/tag_count_reward/std": 0.29713961482048035,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1512091159820557,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 230.999267578125,
"sampling/sampling_logp_difference/mean": 1.0916484594345093,
"step": 1045
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.3315199911594391,
"epoch": 2.7526315789473683,
"grad_norm": 85.15950012207031,
"learning_rate": 1e-06,
"loss": 0.2759,
"step": 1046
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3466692268848419,
"epoch": 2.7552631578947366,
"grad_norm": 0.012678657658398151,
"learning_rate": 1e-06,
"loss": 0.2945,
"step": 1047
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.31701667606830597,
"epoch": 2.7578947368421054,
"grad_norm": 0.01987772062420845,
"learning_rate": 1e-06,
"loss": 0.1537,
"step": 1048
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13047.0,
"completions/mean_length": 2842.43359375,
"completions/mean_terminated_length": 626.5408935546875,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"entropy": 0.32595211267471313,
"epoch": 2.7605263157894737,
"frac_reward_zero_std": 0.125,
"grad_norm": 42.63386535644531,
"learning_rate": 1e-06,
"loss": 0.1182,
"num_tokens": 332023574.0,
"reward": 0.6774349212646484,
"reward_std": 0.2632233202457428,
"rewards/progression_diversity/mean": -0.06901118159294128,
"rewards/progression_diversity/std": 0.17166905105113983,
"rewards/symbolic_reward_accuracy/mean": 0.734375,
"rewards/symbolic_reward_accuracy/std": 0.44209739565849304,
"rewards/symbolic_reward_partial_score/mean": 0.8352864980697632,
"rewards/symbolic_reward_partial_score/std": 0.32469478249549866,
"rewards/tag_count_reward/mean": -0.130859375,
"rewards/tag_count_reward/std": 0.33757632970809937,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.155900239944458,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 234.0,
"sampling/sampling_logp_difference/mean": 1.1260725259780884,
"step": 1049
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.3452259302139282,
"epoch": 2.763157894736842,
"grad_norm": 5.842362403869629,
"learning_rate": 1e-06,
"loss": 0.1744,
"step": 1050
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3828125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.34844136238098145,
"epoch": 2.765789473684211,
"grad_norm": 14.326484680175781,
"learning_rate": 1e-06,
"loss": 0.2425,
"step": 1051
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5390625,
"entropy": 0.35987676680088043,
"epoch": 2.768421052631579,
"grad_norm": 30.41329002380371,
"learning_rate": 1e-06,
"loss": 0.2773,
"step": 1052
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12739.0,
"completions/mean_length": 2201.59375,
"completions/mean_terminated_length": 666.7012939453125,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"entropy": 0.32392917573451996,
"epoch": 2.7710526315789474,
"frac_reward_zero_std": 0.25,
"grad_norm": 95.98092651367188,
"learning_rate": 1e-06,
"loss": 0.1847,
"num_tokens": 333554758.0,
"reward": 0.7248282432556152,
"reward_std": 0.2060919851064682,
"rewards/progression_diversity/mean": -0.04647231847047806,
"rewards/progression_diversity/std": 0.1398431360721588,
"rewards/symbolic_reward_accuracy/mean": 0.791015625,
"rewards/symbolic_reward_accuracy/std": 0.40698084235191345,
"rewards/symbolic_reward_partial_score/mean": 0.8675130009651184,
"rewards/symbolic_reward_partial_score/std": 0.2977105677127838,
"rewards/tag_count_reward/mean": -0.095703125,
"rewards/tag_count_reward/std": 0.2944713830947876,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1349936723709106,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 236.0,
"sampling/sampling_logp_difference/mean": 0.8933988809585571,
"step": 1053
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.33513136208057404,
"epoch": 2.7736842105263158,
"grad_norm": 0.05125413089990616,
"learning_rate": 1e-06,
"loss": 0.139,
"step": 1054
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.32300393283367157,
"epoch": 2.776315789473684,
"grad_norm": 2.2907490730285645,
"learning_rate": 1e-06,
"loss": 0.1541,
"step": 1055
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.32895462214946747,
"epoch": 2.7789473684210524,
"grad_norm": 0.22027532756328583,
"learning_rate": 1e-06,
"loss": 0.1606,
"step": 1056
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.083984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12516.0,
"completions/mean_length": 1867.125,
"completions/mean_terminated_length": 536.1535034179688,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"entropy": 0.32684555649757385,
"epoch": 2.781578947368421,
"frac_reward_zero_std": 0.3125,
"grad_norm": 27.86194610595703,
"learning_rate": 1e-06,
"loss": 0.1246,
"num_tokens": 334916038.0,
"reward": 0.7695657014846802,
"reward_std": 0.18517985939979553,
"rewards/progression_diversity/mean": -0.040504228323698044,
"rewards/progression_diversity/std": 0.13616561889648438,
"rewards/symbolic_reward_accuracy/mean": 0.845703125,
"rewards/symbolic_reward_accuracy/std": 0.36158639192581177,
"rewards/symbolic_reward_partial_score/mean": 0.89990234375,
"rewards/symbolic_reward_partial_score/std": 0.2672683298587799,
"rewards/tag_count_reward/mean": -0.07421875,
"rewards/tag_count_reward/std": 0.2623828947544098,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1254425048828125,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 236.0,
"sampling/sampling_logp_difference/mean": 0.741882860660553,
"step": 1057
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.3313567191362381,
"epoch": 2.7842105263157895,
"grad_norm": 6.761070728302002,
"learning_rate": 1e-06,
"loss": 0.2295,
"step": 1058
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.3324154317378998,
"epoch": 2.786842105263158,
"grad_norm": 0.011156097054481506,
"learning_rate": 1e-06,
"loss": 0.0708,
"step": 1059
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.33027806878089905,
"epoch": 2.7894736842105265,
"grad_norm": 0.16176286339759827,
"learning_rate": 1e-06,
"loss": 0.1187,
"step": 1060
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15293.0,
"completions/mean_length": 2273.78125,
"completions/mean_terminated_length": 678.7130126953125,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"entropy": 0.3395443707704544,
"epoch": 2.792105263157895,
"frac_reward_zero_std": 0.21875,
"grad_norm": 84.5785140991211,
"learning_rate": 1e-06,
"loss": 0.3383,
"num_tokens": 336512054.0,
"reward": 0.7558015584945679,
"reward_std": 0.23675483465194702,
"rewards/progression_diversity/mean": -0.05461447685956955,
"rewards/progression_diversity/std": 0.15894843637943268,
"rewards/symbolic_reward_accuracy/mean": 0.833984375,
"rewards/symbolic_reward_accuracy/std": 0.3724585771560669,
"rewards/symbolic_reward_partial_score/mean": 0.8889973759651184,
"rewards/symbolic_reward_partial_score/std": 0.28549298644065857,
"rewards/tag_count_reward/mean": -0.107421875,
"rewards/tag_count_reward/std": 0.30995169281959534,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1387587785720825,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 236.0,
"sampling/sampling_logp_difference/mean": 0.9233871698379517,
"step": 1061
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3283067047595978,
"epoch": 2.794736842105263,
"grad_norm": 13.06532096862793,
"learning_rate": 1e-06,
"loss": 0.2576,
"step": 1062
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.3241392523050308,
"epoch": 2.7973684210526315,
"grad_norm": 1.6975409984588623,
"learning_rate": 1e-06,
"loss": 0.0366,
"step": 1063
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.31890368461608887,
"epoch": 2.8,
"grad_norm": 10.018250465393066,
"learning_rate": 1e-06,
"loss": 0.1212,
"step": 1064
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14169.0,
"completions/mean_length": 2099.845703125,
"completions/mean_terminated_length": 622.174560546875,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"entropy": 0.3201150894165039,
"epoch": 2.8026315789473686,
"frac_reward_zero_std": 0.125,
"grad_norm": 316.5431823730469,
"learning_rate": 1e-06,
"loss": 0.188,
"num_tokens": 338001831.0,
"reward": 0.7660241723060608,
"reward_std": 0.2371438443660736,
"rewards/progression_diversity/mean": -0.052859190851449966,
"rewards/progression_diversity/std": 0.16034673154354095,
"rewards/symbolic_reward_accuracy/mean": 0.84765625,
"rewards/symbolic_reward_accuracy/std": 0.35970520973205566,
"rewards/symbolic_reward_partial_score/mean": 0.8904622793197632,
"rewards/symbolic_reward_partial_score/std": 0.28877806663513184,
"rewards/tag_count_reward/mean": -0.091796875,
"rewards/tag_count_reward/std": 0.289021372795105,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1355504989624023,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 236.0,
"sampling/sampling_logp_difference/mean": 0.9679932594299316,
"step": 1065
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.32754945755004883,
"epoch": 2.805263157894737,
"grad_norm": 55.01374053955078,
"learning_rate": 1e-06,
"loss": 0.155,
"step": 1066
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.33194398880004883,
"epoch": 2.807894736842105,
"grad_norm": 0.01747787557542324,
"learning_rate": 1e-06,
"loss": 0.2416,
"step": 1067
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.33147796988487244,
"epoch": 2.8105263157894735,
"grad_norm": 0.019125865772366524,
"learning_rate": 1e-06,
"loss": 0.2642,
"step": 1068
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14000.0,
"completions/mean_length": 2323.65625,
"completions/mean_terminated_length": 596.9473876953125,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"entropy": 0.33723948895931244,
"epoch": 2.8131578947368423,
"frac_reward_zero_std": 0.1875,
"grad_norm": 55.272857666015625,
"learning_rate": 1e-06,
"loss": 0.2617,
"num_tokens": 339580151.0,
"reward": 0.7574684619903564,
"reward_std": 0.24873128533363342,
"rewards/progression_diversity/mean": -0.06370329856872559,
"rewards/progression_diversity/std": 0.176786869764328,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.8811849355697632,
"rewards/symbolic_reward_partial_score/std": 0.2997874319553375,
"rewards/tag_count_reward/mean": -0.1015625,
"rewards/tag_count_reward/std": 0.30236753821372986,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1518162488937378,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 235.0,
"sampling/sampling_logp_difference/mean": 1.1923803091049194,
"step": 1069
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.3266005963087082,
"epoch": 2.8157894736842106,
"grad_norm": 14.835233688354492,
"learning_rate": 1e-06,
"loss": 0.1812,
"step": 1070
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.3344879746437073,
"epoch": 2.818421052631579,
"grad_norm": 20.65220832824707,
"learning_rate": 1e-06,
"loss": 0.2772,
"step": 1071
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.32967130839824677,
"epoch": 2.8210526315789473,
"grad_norm": 15.066210746765137,
"learning_rate": 1e-06,
"loss": 0.2274,
"step": 1072
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13483.0,
"completions/mean_length": 2695.2265625,
"completions/mean_terminated_length": 739.6875610351562,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"entropy": 0.3279756009578705,
"epoch": 2.8236842105263156,
"frac_reward_zero_std": 0.125,
"grad_norm": 82.50452423095703,
"learning_rate": 1e-06,
"loss": 0.1288,
"num_tokens": 341365259.0,
"reward": 0.6973080039024353,
"reward_std": 0.2565169632434845,
"rewards/progression_diversity/mean": -0.06900518387556076,
"rewards/progression_diversity/std": 0.174638569355011,
"rewards/symbolic_reward_accuracy/mean": 0.763671875,
"rewards/symbolic_reward_accuracy/std": 0.42524150013923645,
"rewards/symbolic_reward_partial_score/mean": 0.8409830331802368,
"rewards/symbolic_reward_partial_score/std": 0.327880322933197,
"rewards/tag_count_reward/mean": -0.125,
"rewards/tag_count_reward/std": 0.3310423493385315,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.146012783050537,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 238.0,
"sampling/sampling_logp_difference/mean": 1.2247239351272583,
"step": 1073
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.3268892467021942,
"epoch": 2.8263157894736843,
"grad_norm": 0.03915620595216751,
"learning_rate": 1e-06,
"loss": 0.3393,
"step": 1074
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.3162257820367813,
"epoch": 2.8289473684210527,
"grad_norm": 1.8628556728363037,
"learning_rate": 1e-06,
"loss": 0.1444,
"step": 1075
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.32084639370441437,
"epoch": 2.831578947368421,
"grad_norm": 0.034546032547950745,
"learning_rate": 1e-06,
"loss": 0.2549,
"step": 1076
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.119140625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13901.0,
"completions/mean_length": 2555.38671875,
"completions/mean_terminated_length": 684.997802734375,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"entropy": 0.31973057985305786,
"epoch": 2.8342105263157897,
"frac_reward_zero_std": 0.09375,
"grad_norm": 80.3057861328125,
"learning_rate": 1e-06,
"loss": 0.1383,
"num_tokens": 343049393.0,
"reward": 0.7412656545639038,
"reward_std": 0.25144657492637634,
"rewards/progression_diversity/mean": -0.06289247423410416,
"rewards/progression_diversity/std": 0.16770640015602112,
"rewards/symbolic_reward_accuracy/mean": 0.8203125,
"rewards/symbolic_reward_accuracy/std": 0.38430243730545044,
"rewards/symbolic_reward_partial_score/mean": 0.8714193105697632,
"rewards/symbolic_reward_partial_score/std": 0.30579882860183716,
"rewards/tag_count_reward/mean": -0.1171875,
"rewards/tag_count_reward/std": 0.32195815443992615,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1399236917495728,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 239.99998474121094,
"sampling/sampling_logp_difference/mean": 1.2581195831298828,
"step": 1077
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3828125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5234375,
"entropy": 0.31444528698921204,
"epoch": 2.836842105263158,
"grad_norm": 55.245845794677734,
"learning_rate": 1e-06,
"loss": 0.2088,
"step": 1078
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4921875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5625,
"entropy": 0.32562339305877686,
"epoch": 2.8394736842105264,
"grad_norm": 34.23236846923828,
"learning_rate": 1e-06,
"loss": 0.2752,
"step": 1079
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.40625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.32775624096393585,
"epoch": 2.8421052631578947,
"grad_norm": 23.81574821472168,
"learning_rate": 1e-06,
"loss": 0.2996,
"step": 1080
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9691.0,
"completions/mean_length": 2100.416015625,
"completions/mean_terminated_length": 554.5736083984375,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"entropy": 0.3156034052371979,
"epoch": 2.844736842105263,
"frac_reward_zero_std": 0.21875,
"grad_norm": 43.40895462036133,
"learning_rate": 1e-06,
"loss": 0.1877,
"num_tokens": 344549286.0,
"reward": 0.7650686502456665,
"reward_std": 0.24320955574512482,
"rewards/progression_diversity/mean": -0.05075865983963013,
"rewards/progression_diversity/std": 0.15617826581001282,
"rewards/symbolic_reward_accuracy/mean": 0.845703125,
"rewards/symbolic_reward_accuracy/std": 0.36158639192581177,
"rewards/symbolic_reward_partial_score/mean": 0.8904622793197632,
"rewards/symbolic_reward_partial_score/std": 0.2910281717777252,
"rewards/tag_count_reward/mean": -0.08984375,
"rewards/tag_count_reward/std": 0.2862374484539032,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1255333423614502,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 240.0,
"sampling/sampling_logp_difference/mean": 1.064498782157898,
"step": 1081
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.30958323180675507,
"epoch": 2.8473684210526313,
"grad_norm": 9.998337745666504,
"learning_rate": 1e-06,
"loss": 0.1545,
"step": 1082
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.3141885995864868,
"epoch": 2.85,
"grad_norm": 0.007370346691459417,
"learning_rate": 1e-06,
"loss": 0.2394,
"step": 1083
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3183276653289795,
"epoch": 2.8526315789473684,
"grad_norm": 0.008199164643883705,
"learning_rate": 1e-06,
"loss": 0.2117,
"step": 1084
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13934.0,
"completions/mean_length": 2409.056640625,
"completions/mean_terminated_length": 623.71142578125,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"entropy": 0.323482483625412,
"epoch": 2.8552631578947367,
"frac_reward_zero_std": 0.1875,
"grad_norm": 53.07914352416992,
"learning_rate": 1e-06,
"loss": 0.1864,
"num_tokens": 346174179.0,
"reward": 0.7465149164199829,
"reward_std": 0.27112072706222534,
"rewards/progression_diversity/mean": -0.06531209498643875,
"rewards/progression_diversity/std": 0.1783425658941269,
"rewards/symbolic_reward_accuracy/mean": 0.82421875,
"rewards/symbolic_reward_accuracy/std": 0.3810062110424042,
"rewards/symbolic_reward_partial_score/mean": 0.8785806894302368,
"rewards/symbolic_reward_partial_score/std": 0.3004186153411865,
"rewards/tag_count_reward/mean": -0.109375,
"rewards/tag_count_reward/std": 0.31241437792778015,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1350785493850708,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 240.0,
"sampling/sampling_logp_difference/mean": 1.2513222694396973,
"step": 1085
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.32216334342956543,
"epoch": 2.8578947368421055,
"grad_norm": 3.4325926303863525,
"learning_rate": 1e-06,
"loss": 0.2676,
"step": 1086
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.3240419775247574,
"epoch": 2.860526315789474,
"grad_norm": 15.382451057434082,
"learning_rate": 1e-06,
"loss": 0.2478,
"step": 1087
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.32489949464797974,
"epoch": 2.863157894736842,
"grad_norm": 0.019483868032693863,
"learning_rate": 1e-06,
"loss": 0.1895,
"step": 1088
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12305.0,
"completions/mean_length": 2428.5703125,
"completions/mean_terminated_length": 645.718017578125,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"entropy": 0.3152479976415634,
"epoch": 2.8657894736842104,
"frac_reward_zero_std": 0.21875,
"grad_norm": 148.0259246826172,
"learning_rate": 1e-06,
"loss": 0.1724,
"num_tokens": 347824423.0,
"reward": 0.7290340662002563,
"reward_std": 0.232526957988739,
"rewards/progression_diversity/mean": -0.0604589506983757,
"rewards/progression_diversity/std": 0.16615836322307587,
"rewards/symbolic_reward_accuracy/mean": 0.80078125,
"rewards/symbolic_reward_accuracy/std": 0.39980348944664,
"rewards/symbolic_reward_partial_score/mean": 0.8663736581802368,
"rewards/symbolic_reward_partial_score/std": 0.31190311908721924,
"rewards/tag_count_reward/mean": -0.107421875,
"rewards/tag_count_reward/std": 0.30995169281959534,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.127872109413147,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 240.0,
"sampling/sampling_logp_difference/mean": 1.2021994590759277,
"step": 1089
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.3240260183811188,
"epoch": 2.8684210526315788,
"grad_norm": 14.310028076171875,
"learning_rate": 1e-06,
"loss": 0.2462,
"step": 1090
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.31351564824581146,
"epoch": 2.8710526315789475,
"grad_norm": 0.017242752015590668,
"learning_rate": 1e-06,
"loss": 0.0796,
"step": 1091
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3828125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5390625,
"entropy": 0.3323921114206314,
"epoch": 2.873684210526316,
"grad_norm": 0.0029714410193264484,
"learning_rate": 1e-06,
"loss": 0.3207,
"step": 1092
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.107421875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16382.0,
"completions/mean_length": 2427.0859375,
"completions/mean_terminated_length": 747.3698120117188,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"entropy": 0.31317219138145447,
"epoch": 2.876315789473684,
"frac_reward_zero_std": 0.1875,
"grad_norm": 36.501434326171875,
"learning_rate": 1e-06,
"loss": 0.2128,
"num_tokens": 349474067.0,
"reward": 0.7331509590148926,
"reward_std": 0.24093718826770782,
"rewards/progression_diversity/mean": -0.05892288684844971,
"rewards/progression_diversity/std": 0.1655765026807785,
"rewards/symbolic_reward_accuracy/mean": 0.806640625,
"rewards/symbolic_reward_accuracy/std": 0.39531853795051575,
"rewards/symbolic_reward_partial_score/mean": 0.86767578125,
"rewards/symbolic_reward_partial_score/std": 0.31132596731185913,
"rewards/tag_count_reward/mean": -0.10546875,
"rewards/tag_count_reward/std": 0.3074568510055542,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.153733253479004,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 241.0,
"sampling/sampling_logp_difference/mean": 1.242756724357605,
"step": 1093
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.3204523026943207,
"epoch": 2.8789473684210525,
"grad_norm": 3.31962513923645,
"learning_rate": 1e-06,
"loss": 0.2336,
"step": 1094
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.31440404057502747,
"epoch": 2.8815789473684212,
"grad_norm": 7.084986686706543,
"learning_rate": 1e-06,
"loss": 0.1256,
"step": 1095
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.32728545367717743,
"epoch": 2.8842105263157896,
"grad_norm": 0.10335738956928253,
"learning_rate": 1e-06,
"loss": 0.2388,
"step": 1096
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.142578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16244.0,
"completions/mean_length": 2942.568359375,
"completions/mean_terminated_length": 707.432861328125,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"entropy": 0.32194942235946655,
"epoch": 2.886842105263158,
"frac_reward_zero_std": 0.1875,
"grad_norm": 62.69070053100586,
"learning_rate": 1e-06,
"loss": 0.2512,
"num_tokens": 351364438.0,
"reward": 0.6914088726043701,
"reward_std": 0.22169052064418793,
"rewards/progression_diversity/mean": -0.07298420369625092,
"rewards/progression_diversity/std": 0.1761486977338791,
"rewards/symbolic_reward_accuracy/mean": 0.7578125,
"rewards/symbolic_reward_accuracy/std": 0.42882615327835083,
"rewards/symbolic_reward_partial_score/mean": 0.8409830331802368,
"rewards/symbolic_reward_partial_score/std": 0.3356630504131317,
"rewards/tag_count_reward/mean": -0.1484375,
"rewards/tag_count_reward/std": 0.35588082671165466,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.162922978401184,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 243.0,
"sampling/sampling_logp_difference/mean": 1.4567382335662842,
"step": 1097
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.32985997200012207,
"epoch": 2.889473684210526,
"grad_norm": 2.9098451137542725,
"learning_rate": 1e-06,
"loss": 0.2758,
"step": 1098
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.3266884684562683,
"epoch": 2.8921052631578945,
"grad_norm": 0.03355651721358299,
"learning_rate": 1e-06,
"loss": 0.1963,
"step": 1099
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.32841236889362335,
"epoch": 2.8947368421052633,
"grad_norm": 6.091196537017822,
"learning_rate": 1e-06,
"loss": 0.2498,
"step": 1100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.103515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16313.0,
"completions/mean_length": 2224.619140625,
"completions/mean_terminated_length": 589.657958984375,
"completions/min_length": 233.0,
"completions/min_terminated_length": 233.0,
"entropy": 0.3271956741809845,
"epoch": 2.8973684210526316,
"frac_reward_zero_std": 0.25,
"grad_norm": 43.073707580566406,
"learning_rate": 1e-06,
"loss": 0.1519,
"num_tokens": 352894899.0,
"reward": 0.7742513418197632,
"reward_std": 0.20133471488952637,
"rewards/progression_diversity/mean": -0.05045757442712784,
"rewards/progression_diversity/std": 0.14818480610847473,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.89697265625,
"rewards/symbolic_reward_partial_score/std": 0.2875053882598877,
"rewards/tag_count_reward/mean": -0.099609375,
"rewards/tag_count_reward/std": 0.29977133870124817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1496270895004272,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 246.0,
"sampling/sampling_logp_difference/mean": 1.6051921844482422,
"step": 1101
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3244181126356125,
"epoch": 2.9,
"grad_norm": 0.012723026797175407,
"learning_rate": 1e-06,
"loss": 0.2311,
"step": 1102
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.33277395367622375,
"epoch": 2.9026315789473687,
"grad_norm": 2.2688472270965576,
"learning_rate": 1e-06,
"loss": 0.2433,
"step": 1103
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.33416812121868134,
"epoch": 2.905263157894737,
"grad_norm": 5.079883098602295,
"learning_rate": 1e-06,
"loss": 0.1993,
"step": 1104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.080078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12811.0,
"completions/mean_length": 1826.296875,
"completions/mean_terminated_length": 559.0658569335938,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"entropy": 0.3188180774450302,
"epoch": 2.9078947368421053,
"frac_reward_zero_std": 0.21875,
"grad_norm": 23.28694725036621,
"learning_rate": 1e-06,
"loss": 0.1536,
"num_tokens": 354244779.0,
"reward": 0.7635840177536011,
"reward_std": 0.22885069251060486,
"rewards/progression_diversity/mean": -0.038090769201517105,
"rewards/progression_diversity/std": 0.12966662645339966,
"rewards/symbolic_reward_accuracy/mean": 0.833984375,
"rewards/symbolic_reward_accuracy/std": 0.3724585771560669,
"rewards/symbolic_reward_partial_score/mean": 0.9059244394302368,
"rewards/symbolic_reward_partial_score/std": 0.2578800618648529,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1475815773010254,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 248.0,
"sampling/sampling_logp_difference/mean": 1.869348168373108,
"step": 1105
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3273351788520813,
"epoch": 2.9105263157894736,
"grad_norm": 4.7783918380737305,
"learning_rate": 1e-06,
"loss": 0.1048,
"step": 1106
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.3305172771215439,
"epoch": 2.913157894736842,
"grad_norm": 0.007633809465914965,
"learning_rate": 1e-06,
"loss": 0.2638,
"step": 1107
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.333538293838501,
"epoch": 2.9157894736842103,
"grad_norm": 0.015457144938409328,
"learning_rate": 1e-06,
"loss": 0.2158,
"step": 1108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16183.0,
"completions/mean_length": 2176.537109375,
"completions/mean_terminated_length": 604.7787475585938,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"entropy": 0.3226548731327057,
"epoch": 2.918421052631579,
"frac_reward_zero_std": 0.15625,
"grad_norm": 108.32244873046875,
"learning_rate": 1e-06,
"loss": 0.1848,
"num_tokens": 355747614.0,
"reward": 0.7507563233375549,
"reward_std": 0.25011780858039856,
"rewards/progression_diversity/mean": -0.04643935710191727,
"rewards/progression_diversity/std": 0.1406102478504181,
"rewards/symbolic_reward_accuracy/mean": 0.826171875,
"rewards/symbolic_reward_accuracy/std": 0.3793322443962097,
"rewards/symbolic_reward_partial_score/mean": 0.8816731572151184,
"rewards/symbolic_reward_partial_score/std": 0.2946558892726898,
"rewards/tag_count_reward/mean": -0.08984375,
"rewards/tag_count_reward/std": 0.2862374484539032,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1501786708831787,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 252.0,
"sampling/sampling_logp_difference/mean": 2.1973791122436523,
"step": 1109
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.32274968922138214,
"epoch": 2.9210526315789473,
"grad_norm": 0.011330176144838333,
"learning_rate": 1e-06,
"loss": 0.3078,
"step": 1110
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.31783410906791687,
"epoch": 2.9236842105263157,
"grad_norm": 0.008597995154559612,
"learning_rate": 1e-06,
"loss": 0.1984,
"step": 1111
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.3165570944547653,
"epoch": 2.9263157894736844,
"grad_norm": 0.3951316773891449,
"learning_rate": 1e-06,
"loss": 0.2277,
"step": 1112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11247.0,
"completions/mean_length": 2216.048828125,
"completions/mean_terminated_length": 614.454345703125,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"entropy": 0.3109191954135895,
"epoch": 2.9289473684210527,
"frac_reward_zero_std": 0.15625,
"grad_norm": 183.47438049316406,
"learning_rate": 1e-06,
"loss": 0.3146,
"num_tokens": 357277879.0,
"reward": 0.769425630569458,
"reward_std": 0.23327568173408508,
"rewards/progression_diversity/mean": -0.04474321007728577,
"rewards/progression_diversity/std": 0.13221436738967896,
"rewards/symbolic_reward_accuracy/mean": 0.853515625,
"rewards/symbolic_reward_accuracy/std": 0.35393697023391724,
"rewards/symbolic_reward_partial_score/mean": 0.8937174081802368,
"rewards/symbolic_reward_partial_score/std": 0.2835972309112549,
"rewards/tag_count_reward/mean": -0.103515625,
"rewards/tag_count_reward/std": 0.30492907762527466,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1350001096725464,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 254.0,
"sampling/sampling_logp_difference/mean": 2.6123390197753906,
"step": 1113
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.32664044201374054,
"epoch": 2.931578947368421,
"grad_norm": 84.57691192626953,
"learning_rate": 1e-06,
"loss": 0.2191,
"step": 1114
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.31761452555656433,
"epoch": 2.9342105263157894,
"grad_norm": 0.06620530039072037,
"learning_rate": 1e-06,
"loss": 0.1672,
"step": 1115
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.3253130316734314,
"epoch": 2.9368421052631577,
"grad_norm": 0.013269034214317799,
"learning_rate": 1e-06,
"loss": 0.1481,
"step": 1116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.14453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9632.0,
"completions/mean_length": 2862.21875,
"completions/mean_terminated_length": 577.7168579101562,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"entropy": 0.33379143476486206,
"epoch": 2.9394736842105265,
"frac_reward_zero_std": 0.0625,
"grad_norm": 188.9947052001953,
"learning_rate": 1e-06,
"loss": 0.2634,
"num_tokens": 359162663.0,
"reward": 0.7037363648414612,
"reward_std": 0.26258164644241333,
"rewards/progression_diversity/mean": -0.05605129897594452,
"rewards/progression_diversity/std": 0.14197811484336853,
"rewards/symbolic_reward_accuracy/mean": 0.76953125,
"rewards/symbolic_reward_accuracy/std": 0.42154473066329956,
"rewards/symbolic_reward_partial_score/mean": 0.8522135019302368,
"rewards/symbolic_reward_partial_score/std": 0.3169857859611511,
"rewards/tag_count_reward/mean": -0.130859375,
"rewards/tag_count_reward/std": 0.33757632970809937,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1356170177459717,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 258.0,
"sampling/sampling_logp_difference/mean": 3.2635445594787598,
"step": 1117
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.203125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.323417529463768,
"epoch": 2.942105263157895,
"grad_norm": 13.859256744384766,
"learning_rate": 1e-06,
"loss": 0.2917,
"step": 1118
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.3256514072418213,
"epoch": 2.944736842105263,
"grad_norm": 0.9161269664764404,
"learning_rate": 1e-06,
"loss": 0.2028,
"step": 1119
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.484375,
"entropy": 0.3238846957683563,
"epoch": 2.9473684210526314,
"grad_norm": 2.7128119468688965,
"learning_rate": 1e-06,
"loss": 0.1348,
"step": 1120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16291.0,
"completions/mean_length": 2118.107421875,
"completions/mean_terminated_length": 642.325439453125,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.3269234597682953,
"epoch": 2.95,
"frac_reward_zero_std": 0.15625,
"grad_norm": 149.16152954101562,
"learning_rate": 1e-06,
"loss": 0.1463,
"num_tokens": 360662974.0,
"reward": 0.7428156137466431,
"reward_std": 0.2521783113479614,
"rewards/progression_diversity/mean": -0.039731502532958984,
"rewards/progression_diversity/std": 0.12264791131019592,
"rewards/symbolic_reward_accuracy/mean": 0.810546875,
"rewards/symbolic_reward_accuracy/std": 0.3922513723373413,
"rewards/symbolic_reward_partial_score/mean": 0.8855794072151184,
"rewards/symbolic_reward_partial_score/std": 0.2812291979789734,
"rewards/tag_count_reward/mean": -0.087890625,
"rewards/tag_count_reward/std": 0.2834126651287079,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1260616779327393,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 260.0,
"sampling/sampling_logp_difference/mean": 2.9290082454681396,
"step": 1121
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3308548033237457,
"epoch": 2.9526315789473685,
"grad_norm": 105.0246353149414,
"learning_rate": 1e-06,
"loss": 0.2037,
"step": 1122
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.3392486423254013,
"epoch": 2.955263157894737,
"grad_norm": 1.2266844511032104,
"learning_rate": 1e-06,
"loss": 0.2545,
"step": 1123
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.33492106199264526,
"epoch": 2.957894736842105,
"grad_norm": 0.017569491639733315,
"learning_rate": 1e-06,
"loss": 0.2375,
"step": 1124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.111328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16326.0,
"completions/mean_length": 2407.78125,
"completions/mean_terminated_length": 656.914306640625,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.3292433023452759,
"epoch": 2.9605263157894735,
"frac_reward_zero_std": 0.3125,
"grad_norm": 124.67893981933594,
"learning_rate": 1e-06,
"loss": 0.1289,
"num_tokens": 362307246.0,
"reward": 0.7468899488449097,
"reward_std": 0.2119377851486206,
"rewards/progression_diversity/mean": -0.047334060072898865,
"rewards/progression_diversity/std": 0.13447138667106628,
"rewards/symbolic_reward_accuracy/mean": 0.822265625,
"rewards/symbolic_reward_accuracy/std": 0.3826628625392914,
"rewards/symbolic_reward_partial_score/mean": 0.8844400644302368,
"rewards/symbolic_reward_partial_score/std": 0.29540929198265076,
"rewards/tag_count_reward/mean": -0.11328125,
"rewards/tag_count_reward/std": 0.3172462284564972,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.115929365158081,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 261.95855712890625,
"sampling/sampling_logp_difference/mean": 2.85267972946167,
"step": 1125
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.33551347255706787,
"epoch": 2.963157894736842,
"grad_norm": 46.468929290771484,
"learning_rate": 1e-06,
"loss": 0.2668,
"step": 1126
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3421187102794647,
"epoch": 2.9657894736842105,
"grad_norm": 0.010265377350151539,
"learning_rate": 1e-06,
"loss": 0.1918,
"step": 1127
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.33866679668426514,
"epoch": 2.968421052631579,
"grad_norm": 0.012180095538496971,
"learning_rate": 1e-06,
"loss": 0.2088,
"step": 1128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.115234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12667.0,
"completions/mean_length": 2409.67578125,
"completions/mean_terminated_length": 589.6203002929688,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"entropy": 0.32998014986515045,
"epoch": 2.9710526315789476,
"frac_reward_zero_std": 0.1875,
"grad_norm": 134.6818084716797,
"learning_rate": 1e-06,
"loss": 0.2747,
"num_tokens": 363944808.0,
"reward": 0.7039549350738525,
"reward_std": 0.259724497795105,
"rewards/progression_diversity/mean": -0.04884590953588486,
"rewards/progression_diversity/std": 0.13787634670734406,
"rewards/symbolic_reward_accuracy/mean": 0.76953125,
"rewards/symbolic_reward_accuracy/std": 0.42154473066329956,
"rewards/symbolic_reward_partial_score/mean": 0.8474935293197632,
"rewards/symbolic_reward_partial_score/std": 0.3241187334060669,
"rewards/tag_count_reward/mean": -0.115234375,
"rewards/tag_count_reward/std": 0.3196168541908264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1256601810455322,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 262.0,
"sampling/sampling_logp_difference/mean": 3.3490161895751953,
"step": 1129
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3265266716480255,
"epoch": 2.973684210526316,
"grad_norm": 0.2271856814622879,
"learning_rate": 1e-06,
"loss": 0.2083,
"step": 1130
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.3257727771997452,
"epoch": 2.9763157894736842,
"grad_norm": 0.01293912809342146,
"learning_rate": 1e-06,
"loss": 0.2484,
"step": 1131
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.32893073558807373,
"epoch": 2.9789473684210526,
"grad_norm": 0.01243510190397501,
"learning_rate": 1e-06,
"loss": 0.1611,
"step": 1132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16267.0,
"completions/mean_length": 2278.9140625,
"completions/mean_terminated_length": 684.426025390625,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 0.3159499317407608,
"epoch": 2.981578947368421,
"frac_reward_zero_std": 0.03125,
"grad_norm": 262.58709716796875,
"learning_rate": 1e-06,
"loss": 0.2098,
"num_tokens": 365549628.0,
"reward": 0.6847292184829712,
"reward_std": 0.29013562202453613,
"rewards/progression_diversity/mean": -0.042704228311777115,
"rewards/progression_diversity/std": 0.12792158126831055,
"rewards/symbolic_reward_accuracy/mean": 0.736328125,
"rewards/symbolic_reward_accuracy/std": 0.4410543739795685,
"rewards/symbolic_reward_partial_score/mean": 0.8444010019302368,
"rewards/symbolic_reward_partial_score/std": 0.31147703528404236,
"rewards/tag_count_reward/mean": -0.099609375,
"rewards/tag_count_reward/std": 0.29977133870124817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1283180713653564,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 264.0,
"sampling/sampling_logp_difference/mean": 3.4847636222839355,
"step": 1133
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.546875,
"entropy": 0.3305753022432327,
"epoch": 2.984210526315789,
"grad_norm": 2.827713966369629,
"learning_rate": 1e-06,
"loss": 0.2373,
"step": 1134
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.3246810734272003,
"epoch": 2.986842105263158,
"grad_norm": 30.311552047729492,
"learning_rate": 1e-06,
"loss": 0.199,
"step": 1135
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.32095228135585785,
"epoch": 2.9894736842105263,
"grad_norm": 0.029631303623318672,
"learning_rate": 1e-06,
"loss": 0.2114,
"step": 1136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16215.0,
"completions/mean_length": 1764.580078125,
"completions/mean_terminated_length": 658.90966796875,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.3265077769756317,
"epoch": 2.9921052631578946,
"frac_reward_zero_std": 0.4375,
"grad_norm": 68.62413787841797,
"learning_rate": 1e-06,
"loss": 0.136,
"num_tokens": 366860069.0,
"reward": 0.7820947170257568,
"reward_std": 0.12925215065479279,
"rewards/progression_diversity/mean": -0.02784038335084915,
"rewards/progression_diversity/std": 0.09940054267644882,
"rewards/symbolic_reward_accuracy/mean": 0.857421875,
"rewards/symbolic_reward_accuracy/std": 0.3499840497970581,
"rewards/symbolic_reward_partial_score/mean": 0.91845703125,
"rewards/symbolic_reward_partial_score/std": 0.24726058542728424,
"rewards/tag_count_reward/mean": -0.076171875,
"rewards/tag_count_reward/std": 0.26553234457969666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.105454683303833,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 264.0,
"sampling/sampling_logp_difference/mean": 2.1433823108673096,
"step": 1137
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.33374376595020294,
"epoch": 2.9947368421052634,
"grad_norm": 35.08319854736328,
"learning_rate": 1e-06,
"loss": 0.202,
"step": 1138
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.32793696224689484,
"epoch": 2.9973684210526317,
"grad_norm": 5.97829532623291,
"learning_rate": 1e-06,
"loss": 0.1058,
"step": 1139
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.3303612768650055,
"epoch": 3.0,
"grad_norm": 61.562416076660156,
"learning_rate": 1e-06,
"loss": 0.1287,
"step": 1140
},
{
"epoch": 3.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.04638671875,
"eval_completions/max_length": 16384.0,
"eval_completions/max_terminated_length": 5637.46875,
"eval_completions/mean_length": 1238.202880859375,
"eval_completions/mean_terminated_length": 502.11154556274414,
"eval_completions/min_length": 207.375,
"eval_completions/min_terminated_length": 207.375,
"eval_entropy": 0.33187594451010227,
"eval_frac_reward_zero_std": 0.28515625,
"eval_loss": 0.05398595333099365,
"eval_num_tokens": 366860069.0,
"eval_reward": 0.7990773729979992,
"eval_reward_std": 0.21094764932058752,
"eval_rewards/progression_diversity/mean": -0.020244480154360645,
"eval_rewards/progression_diversity/std": 0.08825050009181723,
"eval_rewards/symbolic_reward_accuracy/mean": 0.87890625,
"eval_rewards/symbolic_reward_accuracy/std": 0.3186751971952617,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9210205078125,
"eval_rewards/symbolic_reward_partial_score/std": 0.22956605115905404,
"eval_rewards/tag_count_reward/mean": -0.043701171875,
"eval_rewards/tag_count_reward/std": 0.19665716541931033,
"eval_runtime": 4075.6424,
"eval_samples_per_second": 0.061,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.084825448691845,
"eval_sampling/importance_sampling_ratio/min": 0.0,
"eval_sampling/sampling_logp_difference/max": 265.8116159439087,
"eval_sampling/sampling_logp_difference/mean": 1.002108539454639,
"eval_steps_per_second": 0.0,
"step": 1140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2218.0,
"completions/mean_length": 2117.115234375,
"completions/mean_terminated_length": 538.7830810546875,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"entropy": 0.3300452083349228,
"epoch": 3.0026315789473683,
"frac_reward_zero_std": 0.3125,
"grad_norm": 52.12168884277344,
"learning_rate": 1e-06,
"loss": 0.0937,
"num_tokens": 368347840.0,
"reward": 0.7367914915084839,
"reward_std": 0.1933838427066803,
"rewards/progression_diversity/mean": -0.03667115420103073,
"rewards/progression_diversity/std": 0.11381202191114426,
"rewards/symbolic_reward_accuracy/mean": 0.806640625,
"rewards/symbolic_reward_accuracy/std": 0.39531853795051575,
"rewards/symbolic_reward_partial_score/mean": 0.87451171875,
"rewards/symbolic_reward_partial_score/std": 0.3029172122478485,
"rewards/tag_count_reward/mean": -0.091796875,
"rewards/tag_count_reward/std": 0.289021372795105,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1042859554290771,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 266.0,
"sampling/sampling_logp_difference/mean": 2.27083158493042,
"step": 1141
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.32495447993278503,
"epoch": 3.0052631578947366,
"grad_norm": 1.0844265222549438,
"learning_rate": 1e-06,
"loss": 0.1562,
"step": 1142
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.31977418065071106,
"epoch": 3.0078947368421054,
"grad_norm": 32.22896194458008,
"learning_rate": 1e-06,
"loss": 0.0852,
"step": 1143
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.33107973635196686,
"epoch": 3.0105263157894737,
"grad_norm": 0.01271702442318201,
"learning_rate": 1e-06,
"loss": 0.183,
"step": 1144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.068359375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2963.0,
"completions/mean_length": 1620.275390625,
"completions/mean_terminated_length": 536.9832153320312,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"entropy": 0.3337739109992981,
"epoch": 3.013157894736842,
"frac_reward_zero_std": 0.28125,
"grad_norm": 256.0187683105469,
"learning_rate": 1e-06,
"loss": 0.1703,
"num_tokens": 369584397.0,
"reward": 0.7931051254272461,
"reward_std": 0.1887580156326294,
"rewards/progression_diversity/mean": -0.02543444186449051,
"rewards/progression_diversity/std": 0.09666049480438232,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.923828125,
"rewards/symbolic_reward_partial_score/std": 0.23379817605018616,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.108473300933838,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 268.0,
"sampling/sampling_logp_difference/mean": 2.444417953491211,
"step": 1145
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.33417925238609314,
"epoch": 3.0157894736842104,
"grad_norm": 152.89532470703125,
"learning_rate": 1e-06,
"loss": 0.2239,
"step": 1146
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.3302992135286331,
"epoch": 3.018421052631579,
"grad_norm": 2.4071531295776367,
"learning_rate": 1e-06,
"loss": 0.1218,
"step": 1147
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.32197025418281555,
"epoch": 3.0210526315789474,
"grad_norm": 0.013167927972972393,
"learning_rate": 1e-06,
"loss": 0.0594,
"step": 1148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13552.0,
"completions/mean_length": 1762.455078125,
"completions/mean_terminated_length": 590.263671875,
"completions/min_length": 210.0,
"completions/min_terminated_length": 210.0,
"entropy": 0.31848709285259247,
"epoch": 3.0236842105263158,
"frac_reward_zero_std": 0.25,
"grad_norm": 167.57388305664062,
"learning_rate": 1e-06,
"loss": 0.0925,
"num_tokens": 370898422.0,
"reward": 0.775731086730957,
"reward_std": 0.21161219477653503,
"rewards/progression_diversity/mean": -0.029432490468025208,
"rewards/progression_diversity/std": 0.1058623269200325,
"rewards/symbolic_reward_accuracy/mean": 0.85546875,
"rewards/symbolic_reward_accuracy/std": 0.35197147727012634,
"rewards/symbolic_reward_partial_score/mean": 0.8986002802848816,
"rewards/symbolic_reward_partial_score/std": 0.2755959630012512,
"rewards/tag_count_reward/mean": -0.068359375,
"rewards/tag_count_reward/std": 0.25260838866233826,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1008307933807373,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 268.0,
"sampling/sampling_logp_difference/mean": 2.5271763801574707,
"step": 1149
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.32427777349948883,
"epoch": 3.026315789473684,
"grad_norm": 0.09036588668823242,
"learning_rate": 1e-06,
"loss": 0.1532,
"step": 1150
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.32039374113082886,
"epoch": 3.028947368421053,
"grad_norm": 0.03532138466835022,
"learning_rate": 1e-06,
"loss": 0.1982,
"step": 1151
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.3186514228582382,
"epoch": 3.031578947368421,
"grad_norm": 0.011541608721017838,
"learning_rate": 1e-06,
"loss": 0.1511,
"step": 1152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9474.0,
"completions/mean_length": 1170.345703125,
"completions/mean_terminated_length": 551.9044799804688,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"entropy": 0.3213903158903122,
"epoch": 3.0342105263157895,
"frac_reward_zero_std": 0.4375,
"grad_norm": 25.063764572143555,
"learning_rate": 1e-06,
"loss": 0.0644,
"num_tokens": 371893767.0,
"reward": 0.8350409269332886,
"reward_std": 0.14898480474948883,
"rewards/progression_diversity/mean": -0.016419440507888794,
"rewards/progression_diversity/std": 0.08188439160585403,
"rewards/symbolic_reward_accuracy/mean": 0.92578125,
"rewards/symbolic_reward_accuracy/std": 0.2623828947544098,
"rewards/symbolic_reward_partial_score/mean": 0.9454752802848816,
"rewards/symbolic_reward_partial_score/std": 0.20989501476287842,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.088158369064331,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 268.0,
"sampling/sampling_logp_difference/mean": 1.5961803197860718,
"step": 1153
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.3240705579519272,
"epoch": 3.036842105263158,
"grad_norm": 0.00562731409445405,
"learning_rate": 1e-06,
"loss": 0.1014,
"step": 1154
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.32319276034832,
"epoch": 3.039473684210526,
"grad_norm": 0.006894012447446585,
"learning_rate": 1e-06,
"loss": 0.097,
"step": 1155
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.3254219591617584,
"epoch": 3.042105263157895,
"grad_norm": 0.0085447384044528,
"learning_rate": 1e-06,
"loss": 0.0831,
"step": 1156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.068359375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14955.0,
"completions/mean_length": 1684.576171875,
"completions/mean_terminated_length": 606.0020751953125,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"entropy": 0.3363809883594513,
"epoch": 3.044736842105263,
"frac_reward_zero_std": 0.28125,
"grad_norm": 185.30889892578125,
"learning_rate": 1e-06,
"loss": 0.1307,
"num_tokens": 373167598.0,
"reward": 0.7592610120773315,
"reward_std": 0.2298196256160736,
"rewards/progression_diversity/mean": -0.026055842638015747,
"rewards/progression_diversity/std": 0.09842479228973389,
"rewards/symbolic_reward_accuracy/mean": 0.830078125,
"rewards/symbolic_reward_accuracy/std": 0.3759314715862274,
"rewards/symbolic_reward_partial_score/mean": 0.89306640625,
"rewards/symbolic_reward_partial_score/std": 0.2722471356391907,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0948313474655151,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 270.0,
"sampling/sampling_logp_difference/mean": 2.451362133026123,
"step": 1157
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.32949453592300415,
"epoch": 3.0473684210526315,
"grad_norm": 0.15845933556556702,
"learning_rate": 1e-06,
"loss": 0.15,
"step": 1158
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3209853768348694,
"epoch": 3.05,
"grad_norm": 0.012255529873073101,
"learning_rate": 1e-06,
"loss": 0.1257,
"step": 1159
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.3235623836517334,
"epoch": 3.0526315789473686,
"grad_norm": 0.0065957182087004185,
"learning_rate": 1e-06,
"loss": 0.1606,
"step": 1160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.076171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10149.0,
"completions/mean_length": 1787.939453125,
"completions/mean_terminated_length": 584.458740234375,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"entropy": 0.3329770565032959,
"epoch": 3.055263157894737,
"frac_reward_zero_std": 0.3125,
"grad_norm": 219.4588165283203,
"learning_rate": 1e-06,
"loss": 0.1499,
"num_tokens": 374477807.0,
"reward": 0.7548931837081909,
"reward_std": 0.1553860455751419,
"rewards/progression_diversity/mean": -0.02826322615146637,
"rewards/progression_diversity/std": 0.10057384520769119,
"rewards/symbolic_reward_accuracy/mean": 0.822265625,
"rewards/symbolic_reward_accuracy/std": 0.3826628625392914,
"rewards/symbolic_reward_partial_score/mean": 0.8968098759651184,
"rewards/symbolic_reward_partial_score/std": 0.26359865069389343,
"rewards/tag_count_reward/mean": -0.072265625,
"rewards/tag_count_reward/std": 0.2591804563999176,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0940297842025757,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 272.0,
"sampling/sampling_logp_difference/mean": 2.350238084793091,
"step": 1161
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.3281650245189667,
"epoch": 3.057894736842105,
"grad_norm": 0.007984149269759655,
"learning_rate": 1e-06,
"loss": 0.2052,
"step": 1162
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.3284861296415329,
"epoch": 3.0605263157894735,
"grad_norm": 0.035218384116888046,
"learning_rate": 1e-06,
"loss": 0.12,
"step": 1163
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3274737000465393,
"epoch": 3.0631578947368423,
"grad_norm": 0.012610013596713543,
"learning_rate": 1e-06,
"loss": 0.1512,
"step": 1164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 1962.0,
"completions/mean_length": 1523.06640625,
"completions/mean_terminated_length": 532.3375244140625,
"completions/min_length": 208.0,
"completions/min_terminated_length": 208.0,
"entropy": 0.3336295783519745,
"epoch": 3.0657894736842106,
"frac_reward_zero_std": 0.4375,
"grad_norm": 120.76913452148438,
"learning_rate": 1e-06,
"loss": 0.1477,
"num_tokens": 375667281.0,
"reward": 0.8087124824523926,
"reward_std": 0.15946154296398163,
"rewards/progression_diversity/mean": -0.022307991981506348,
"rewards/progression_diversity/std": 0.08901267498731613,
"rewards/symbolic_reward_accuracy/mean": 0.892578125,
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534,
"rewards/symbolic_reward_partial_score/mean": 0.93212890625,
"rewards/symbolic_reward_partial_score/std": 0.231436625123024,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0914645195007324,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 272.0,
"sampling/sampling_logp_difference/mean": 1.7476840019226074,
"step": 1165
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.32854387164115906,
"epoch": 3.068421052631579,
"grad_norm": 0.015112695284187794,
"learning_rate": 1e-06,
"loss": 0.1242,
"step": 1166
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.3354027718305588,
"epoch": 3.0710526315789473,
"grad_norm": 0.6051515340805054,
"learning_rate": 1e-06,
"loss": 0.0242,
"step": 1167
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.3323533236980438,
"epoch": 3.0736842105263156,
"grad_norm": 0.01162874884903431,
"learning_rate": 1e-06,
"loss": 0.2038,
"step": 1168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2259.0,
"completions/mean_length": 1519.060546875,
"completions/mean_terminated_length": 528.0646362304688,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"entropy": 0.34026047587394714,
"epoch": 3.0763157894736843,
"frac_reward_zero_std": 0.34375,
"grad_norm": 21.67325210571289,
"learning_rate": 1e-06,
"loss": 0.0595,
"num_tokens": 376820816.0,
"reward": 0.8038376569747925,
"reward_std": 0.16369682550430298,
"rewards/progression_diversity/mean": -0.021513454616069794,
"rewards/progression_diversity/std": 0.08785874396562576,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9295247197151184,
"rewards/symbolic_reward_partial_score/std": 0.22601304948329926,
"rewards/tag_count_reward/mean": -0.056640625,
"rewards/tag_count_reward/std": 0.23138070106506348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0951886177062988,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 272.0,
"sampling/sampling_logp_difference/mean": 1.9224200248718262,
"step": 1169
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.33099737763404846,
"epoch": 3.0789473684210527,
"grad_norm": 0.010425153188407421,
"learning_rate": 1e-06,
"loss": 0.1494,
"step": 1170
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.3235698342323303,
"epoch": 3.081578947368421,
"grad_norm": 0.013261355459690094,
"learning_rate": 1e-06,
"loss": 0.1044,
"step": 1171
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.3331771790981293,
"epoch": 3.0842105263157893,
"grad_norm": 0.008659729734063148,
"learning_rate": 1e-06,
"loss": 0.2015,
"step": 1172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15931.0,
"completions/mean_length": 1108.689453125,
"completions/mean_terminated_length": 552.0991821289062,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"entropy": 0.33606918156147003,
"epoch": 3.086842105263158,
"frac_reward_zero_std": 0.46875,
"grad_norm": 340.97711181640625,
"learning_rate": 1e-06,
"loss": 0.1414,
"num_tokens": 377781745.0,
"reward": 0.8523913025856018,
"reward_std": 0.13625648617744446,
"rewards/progression_diversity/mean": -0.014778539538383484,
"rewards/progression_diversity/std": 0.07598426192998886,
"rewards/symbolic_reward_accuracy/mean": 0.947265625,
"rewards/symbolic_reward_accuracy/std": 0.22372129559516907,
"rewards/symbolic_reward_partial_score/mean": 0.9596354365348816,
"rewards/symbolic_reward_partial_score/std": 0.185786634683609,
"rewards/tag_count_reward/mean": -0.037109375,
"rewards/tag_count_reward/std": 0.18921469151973724,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0950132608413696,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 274.0,
"sampling/sampling_logp_difference/mean": 1.4215747117996216,
"step": 1173
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.3379185199737549,
"epoch": 3.0894736842105264,
"grad_norm": 0.018991535529494286,
"learning_rate": 1e-06,
"loss": 0.1565,
"step": 1174
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.33449527621269226,
"epoch": 3.0921052631578947,
"grad_norm": 0.005514142569154501,
"learning_rate": 1e-06,
"loss": 0.062,
"step": 1175
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.32939158380031586,
"epoch": 3.094736842105263,
"grad_norm": 0.005196165293455124,
"learning_rate": 1e-06,
"loss": 0.0599,
"step": 1176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15172.0,
"completions/mean_length": 1775.599609375,
"completions/mean_terminated_length": 604.4619750976562,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"entropy": 0.3369821459054947,
"epoch": 3.0973684210526318,
"frac_reward_zero_std": 0.3125,
"grad_norm": 145.88417053222656,
"learning_rate": 1e-06,
"loss": 0.1591,
"num_tokens": 379096324.0,
"reward": 0.7732741236686707,
"reward_std": 0.21388080716133118,
"rewards/progression_diversity/mean": -0.026106324046850204,
"rewards/progression_diversity/std": 0.09331251680850983,
"rewards/symbolic_reward_accuracy/mean": 0.84765625,
"rewards/symbolic_reward_accuracy/std": 0.35970520973205566,
"rewards/symbolic_reward_partial_score/mean": 0.9078775644302368,
"rewards/symbolic_reward_partial_score/std": 0.2603663206100464,
"rewards/tag_count_reward/mean": -0.07421875,
"rewards/tag_count_reward/std": 0.2623828947544098,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.103142499923706,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 276.0,
"sampling/sampling_logp_difference/mean": 2.0799942016601562,
"step": 1177
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.34052442014217377,
"epoch": 3.1,
"grad_norm": 0.011826397851109505,
"learning_rate": 1e-06,
"loss": 0.1395,
"step": 1178
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3255487084388733,
"epoch": 3.1026315789473684,
"grad_norm": 0.008594873361289501,
"learning_rate": 1e-06,
"loss": 0.1081,
"step": 1179
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.3336612284183502,
"epoch": 3.1052631578947367,
"grad_norm": 0.01105382852256298,
"learning_rate": 1e-06,
"loss": 0.1773,
"step": 1180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.083984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2873.0,
"completions/mean_length": 1904.052734375,
"completions/mean_terminated_length": 576.4669799804688,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"entropy": 0.3368203490972519,
"epoch": 3.1078947368421055,
"frac_reward_zero_std": 0.28125,
"grad_norm": 61.46799087524414,
"learning_rate": 1e-06,
"loss": 0.0739,
"num_tokens": 380476511.0,
"reward": 0.7395263910293579,
"reward_std": 0.2011542022228241,
"rewards/progression_diversity/mean": -0.026853924617171288,
"rewards/progression_diversity/std": 0.09222765266895294,
"rewards/symbolic_reward_accuracy/mean": 0.802734375,
"rewards/symbolic_reward_accuracy/std": 0.3983237147331238,
"rewards/symbolic_reward_partial_score/mean": 0.88525390625,
"rewards/symbolic_reward_partial_score/std": 0.2737979590892792,
"rewards/tag_count_reward/mean": -0.07421875,
"rewards/tag_count_reward/std": 0.2623828947544098,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0973585844039917,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 276.0,
"sampling/sampling_logp_difference/mean": 1.8168814182281494,
"step": 1181
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.3273409307003021,
"epoch": 3.110526315789474,
"grad_norm": 0.014104950241744518,
"learning_rate": 1e-06,
"loss": 0.1636,
"step": 1182
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.315226212143898,
"epoch": 3.113157894736842,
"grad_norm": 0.3530890643596649,
"learning_rate": 1e-06,
"loss": 0.1942,
"step": 1183
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3299044966697693,
"epoch": 3.1157894736842104,
"grad_norm": 0.017564065754413605,
"learning_rate": 1e-06,
"loss": 0.1096,
"step": 1184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.068359375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10168.0,
"completions/mean_length": 1682.30859375,
"completions/mean_terminated_length": 603.568115234375,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"entropy": 0.32700584828853607,
"epoch": 3.1184210526315788,
"frac_reward_zero_std": 0.34375,
"grad_norm": 11.54803466796875,
"learning_rate": 1e-06,
"loss": 0.0785,
"num_tokens": 381744989.0,
"reward": 0.7641712427139282,
"reward_std": 0.18420130014419556,
"rewards/progression_diversity/mean": -0.023311495780944824,
"rewards/progression_diversity/std": 0.08777539432048798,
"rewards/symbolic_reward_accuracy/mean": 0.83203125,
"rewards/symbolic_reward_accuracy/std": 0.374204158782959,
"rewards/symbolic_reward_partial_score/mean": 0.90478515625,
"rewards/symbolic_reward_partial_score/std": 0.2593802213668823,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0980756282806396,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 278.0,
"sampling/sampling_logp_difference/mean": 1.7065038681030273,
"step": 1185
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3249793201684952,
"epoch": 3.1210526315789475,
"grad_norm": 242.987060546875,
"learning_rate": 1e-06,
"loss": 0.157,
"step": 1186
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.3353985846042633,
"epoch": 3.123684210526316,
"grad_norm": 0.011672238819301128,
"learning_rate": 1e-06,
"loss": 0.164,
"step": 1187
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3257022500038147,
"epoch": 3.126315789473684,
"grad_norm": 0.03746514022350311,
"learning_rate": 1e-06,
"loss": 0.1344,
"step": 1188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2665.0,
"completions/mean_length": 2012.21484375,
"completions/mean_terminated_length": 593.540771484375,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"entropy": 0.326521098613739,
"epoch": 3.1289473684210525,
"frac_reward_zero_std": 0.28125,
"grad_norm": 233.8143310546875,
"learning_rate": 1e-06,
"loss": 0.1006,
"num_tokens": 383184043.0,
"reward": 0.7504065632820129,
"reward_std": 0.20918390154838562,
"rewards/progression_diversity/mean": -0.027706898748874664,
"rewards/progression_diversity/std": 0.09274362027645111,
"rewards/symbolic_reward_accuracy/mean": 0.822265625,
"rewards/symbolic_reward_accuracy/std": 0.3826628625392914,
"rewards/symbolic_reward_partial_score/mean": 0.8850911855697632,
"rewards/symbolic_reward_partial_score/std": 0.289438933134079,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1031694412231445,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 278.0,
"sampling/sampling_logp_difference/mean": 1.8653408288955688,
"step": 1189
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.32067760825157166,
"epoch": 3.1315789473684212,
"grad_norm": 49.3584098815918,
"learning_rate": 1e-06,
"loss": 0.1867,
"step": 1190
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.3191855549812317,
"epoch": 3.1342105263157896,
"grad_norm": 0.021937908604741096,
"learning_rate": 1e-06,
"loss": 0.2106,
"step": 1191
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.30871887505054474,
"epoch": 3.136842105263158,
"grad_norm": 0.017672132700681686,
"learning_rate": 1e-06,
"loss": 0.1299,
"step": 1192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7855.0,
"completions/mean_length": 2210.833984375,
"completions/mean_terminated_length": 608.6499633789062,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"entropy": 0.3056802898645401,
"epoch": 3.139473684210526,
"frac_reward_zero_std": 0.1875,
"grad_norm": 79.18196868896484,
"learning_rate": 1e-06,
"loss": 0.2021,
"num_tokens": 384727958.0,
"reward": 0.754828691482544,
"reward_std": 0.21231617033481598,
"rewards/progression_diversity/mean": -0.03471089154481888,
"rewards/progression_diversity/std": 0.10704705864191055,
"rewards/symbolic_reward_accuracy/mean": 0.83203125,
"rewards/symbolic_reward_accuracy/std": 0.374204158782959,
"rewards/symbolic_reward_partial_score/mean": 0.8811848759651184,
"rewards/symbolic_reward_partial_score/std": 0.29755792021751404,
"rewards/tag_count_reward/mean": -0.083984375,
"rewards/tag_count_reward/std": 0.2776356339454651,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1085774898529053,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 280.0,
"sampling/sampling_logp_difference/mean": 2.3273110389709473,
"step": 1193
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.3188657760620117,
"epoch": 3.1421052631578945,
"grad_norm": 0.007435488048940897,
"learning_rate": 1e-06,
"loss": 0.1936,
"step": 1194
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.31570450961589813,
"epoch": 3.1447368421052633,
"grad_norm": 0.71424400806427,
"learning_rate": 1e-06,
"loss": 0.1574,
"step": 1195
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.390625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.484375,
"entropy": 0.3103174865245819,
"epoch": 3.1473684210526316,
"grad_norm": 2.483917474746704,
"learning_rate": 1e-06,
"loss": 0.2062,
"step": 1196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11863.0,
"completions/mean_length": 2258.6640625,
"completions/mean_terminated_length": 593.2314453125,
"completions/min_length": 233.0,
"completions/min_terminated_length": 233.0,
"entropy": 0.3229822814464569,
"epoch": 3.15,
"frac_reward_zero_std": 0.25,
"grad_norm": 142.69839477539062,
"learning_rate": 1e-06,
"loss": 0.1636,
"num_tokens": 386288202.0,
"reward": 0.7562662363052368,
"reward_std": 0.2117644101381302,
"rewards/progression_diversity/mean": -0.032554611563682556,
"rewards/progression_diversity/std": 0.09795653074979782,
"rewards/symbolic_reward_accuracy/mean": 0.837890625,
"rewards/symbolic_reward_accuracy/std": 0.3689115643501282,
"rewards/symbolic_reward_partial_score/mean": 0.8787435293197632,
"rewards/symbolic_reward_partial_score/std": 0.3041481375694275,
"rewards/tag_count_reward/mean": -0.09765625,
"rewards/tag_count_reward/std": 0.29713961482048035,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.106873869895935,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 280.0,
"sampling/sampling_logp_difference/mean": 1.8862922191619873,
"step": 1197
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.31533288955688477,
"epoch": 3.1526315789473682,
"grad_norm": 63.04111099243164,
"learning_rate": 1e-06,
"loss": 0.1067,
"step": 1198
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3111482411623001,
"epoch": 3.155263157894737,
"grad_norm": 0.010430481284856796,
"learning_rate": 1e-06,
"loss": 0.2744,
"step": 1199
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.32699713110923767,
"epoch": 3.1578947368421053,
"grad_norm": 79.19801330566406,
"learning_rate": 1e-06,
"loss": 0.1564,
"step": 1200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2453.0,
"completions/mean_length": 2007.53125,
"completions/mean_terminated_length": 588.3948364257812,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"entropy": 0.3115497827529907,
"epoch": 3.1605263157894736,
"frac_reward_zero_std": 0.28125,
"grad_norm": 73.29586029052734,
"learning_rate": 1e-06,
"loss": 0.1916,
"num_tokens": 387721850.0,
"reward": 0.7630906701087952,
"reward_std": 0.19893735647201538,
"rewards/progression_diversity/mean": -0.028826594352722168,
"rewards/progression_diversity/std": 0.09443072229623795,
"rewards/symbolic_reward_accuracy/mean": 0.8359375,
"rewards/symbolic_reward_accuracy/std": 0.37069445848464966,
"rewards/symbolic_reward_partial_score/mean": 0.8981119394302368,
"rewards/symbolic_reward_partial_score/std": 0.2708629369735718,
"rewards/tag_count_reward/mean": -0.076171875,
"rewards/tag_count_reward/std": 0.26553234457969666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.108184576034546,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 282.0,
"sampling/sampling_logp_difference/mean": 2.00418758392334,
"step": 1201
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.32552970945835114,
"epoch": 3.163157894736842,
"grad_norm": 6.941614627838135,
"learning_rate": 1e-06,
"loss": 0.1422,
"step": 1202
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.32488586008548737,
"epoch": 3.1657894736842107,
"grad_norm": 0.01468179002404213,
"learning_rate": 1e-06,
"loss": 0.1705,
"step": 1203
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.30805665254592896,
"epoch": 3.168421052631579,
"grad_norm": 0.013095368631184101,
"learning_rate": 1e-06,
"loss": 0.2168,
"step": 1204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.072265625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9420.0,
"completions/mean_length": 1713.927734375,
"completions/mean_terminated_length": 571.206298828125,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"entropy": 0.3258505165576935,
"epoch": 3.1710526315789473,
"frac_reward_zero_std": 0.34375,
"grad_norm": 112.94711303710938,
"learning_rate": 1e-06,
"loss": 0.1194,
"num_tokens": 388982517.0,
"reward": 0.8024517297744751,
"reward_std": 0.18236470222473145,
"rewards/progression_diversity/mean": -0.023384764790534973,
"rewards/progression_diversity/std": 0.08616573363542557,
"rewards/symbolic_reward_accuracy/mean": 0.888671875,
"rewards/symbolic_reward_accuracy/std": 0.31484565138816833,
"rewards/symbolic_reward_partial_score/mean": 0.9152017831802368,
"rewards/symbolic_reward_partial_score/std": 0.25712403655052185,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1038358211517334,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 284.0,
"sampling/sampling_logp_difference/mean": 1.693922996520996,
"step": 1205
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3253411054611206,
"epoch": 3.1736842105263157,
"grad_norm": 4.749903202056885,
"learning_rate": 1e-06,
"loss": 0.229,
"step": 1206
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.32736538350582123,
"epoch": 3.1763157894736844,
"grad_norm": 0.008272160775959492,
"learning_rate": 1e-06,
"loss": 0.1404,
"step": 1207
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.32286109030246735,
"epoch": 3.1789473684210527,
"grad_norm": 0.004888339899480343,
"learning_rate": 1e-06,
"loss": 0.1054,
"step": 1208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15292.0,
"completions/mean_length": 1806.24609375,
"completions/mean_terminated_length": 570.8432006835938,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.3235931098461151,
"epoch": 3.181578947368421,
"frac_reward_zero_std": 0.40625,
"grad_norm": 25.39488983154297,
"learning_rate": 1e-06,
"loss": 0.1452,
"num_tokens": 390291091.0,
"reward": 0.8004156351089478,
"reward_std": 0.1887449473142624,
"rewards/progression_diversity/mean": -0.026793645694851875,
"rewards/progression_diversity/std": 0.09325826168060303,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9176431894302368,
"rewards/symbolic_reward_partial_score/std": 0.25725844502449036,
"rewards/tag_count_reward/mean": -0.06640625,
"rewards/tag_count_reward/std": 0.2492343932390213,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1049944162368774,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 288.0,
"sampling/sampling_logp_difference/mean": 1.934594750404358,
"step": 1209
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3248671740293503,
"epoch": 3.1842105263157894,
"grad_norm": 0.007971227169036865,
"learning_rate": 1e-06,
"loss": 0.1074,
"step": 1210
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.311784565448761,
"epoch": 3.1868421052631577,
"grad_norm": 15.894944190979004,
"learning_rate": 1e-06,
"loss": 0.188,
"step": 1211
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.3261888176202774,
"epoch": 3.1894736842105265,
"grad_norm": 0.019114594906568527,
"learning_rate": 1e-06,
"loss": 0.1449,
"step": 1212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.072265625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9429.0,
"completions/mean_length": 1767.72265625,
"completions/mean_terminated_length": 629.1915283203125,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"entropy": 0.3183141201734543,
"epoch": 3.192105263157895,
"frac_reward_zero_std": 0.3125,
"grad_norm": 201.4045867919922,
"learning_rate": 1e-06,
"loss": 0.1038,
"num_tokens": 391601637.0,
"reward": 0.793647050857544,
"reward_std": 0.20239751040935516,
"rewards/progression_diversity/mean": -0.02494898810982704,
"rewards/progression_diversity/std": 0.08966317027807236,
"rewards/symbolic_reward_accuracy/mean": 0.875,
"rewards/symbolic_reward_accuracy/std": 0.3310423493385315,
"rewards/symbolic_reward_partial_score/mean": 0.9158528447151184,
"rewards/symbolic_reward_partial_score/std": 0.2540964186191559,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1079578399658203,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 290.0,
"sampling/sampling_logp_difference/mean": 2.028384208679199,
"step": 1213
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.31385740637779236,
"epoch": 3.194736842105263,
"grad_norm": 0.025811778381466866,
"learning_rate": 1e-06,
"loss": 0.1903,
"step": 1214
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3017711192369461,
"epoch": 3.1973684210526314,
"grad_norm": 148.46176147460938,
"learning_rate": 1e-06,
"loss": 0.243,
"step": 1215
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.3152967840433121,
"epoch": 3.2,
"grad_norm": 6.499451160430908,
"learning_rate": 1e-06,
"loss": 0.0559,
"step": 1216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12806.0,
"completions/mean_length": 2251.021484375,
"completions/mean_terminated_length": 653.3804321289062,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"entropy": 0.30623379349708557,
"epoch": 3.2026315789473685,
"frac_reward_zero_std": 0.28125,
"grad_norm": 91.87493133544922,
"learning_rate": 1e-06,
"loss": 0.1956,
"num_tokens": 393178320.0,
"reward": 0.7417978048324585,
"reward_std": 0.22114922106266022,
"rewards/progression_diversity/mean": -0.029206208884716034,
"rewards/progression_diversity/std": 0.09235595166683197,
"rewards/symbolic_reward_accuracy/mean": 0.81640625,
"rewards/symbolic_reward_accuracy/std": 0.3875311613082886,
"rewards/symbolic_reward_partial_score/mean": 0.8688150644302368,
"rewards/symbolic_reward_partial_score/std": 0.30946817994117737,
"rewards/tag_count_reward/mean": -0.083984375,
"rewards/tag_count_reward/std": 0.2776356339454651,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1069762706756592,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 292.0,
"sampling/sampling_logp_difference/mean": 1.9247076511383057,
"step": 1217
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.30731241405010223,
"epoch": 3.205263157894737,
"grad_norm": 6.715027809143066,
"learning_rate": 1e-06,
"loss": 0.1788,
"step": 1218
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2882115989923477,
"epoch": 3.207894736842105,
"grad_norm": 0.01562909595668316,
"learning_rate": 1e-06,
"loss": 0.2044,
"step": 1219
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.30942070484161377,
"epoch": 3.2105263157894735,
"grad_norm": 0.34386348724365234,
"learning_rate": 1e-06,
"loss": 0.1043,
"step": 1220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.111328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2114.0,
"completions/mean_length": 2317.568359375,
"completions/mean_terminated_length": 555.4000244140625,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.3095013201236725,
"epoch": 3.213157894736842,
"frac_reward_zero_std": 0.15625,
"grad_norm": 89.25138092041016,
"learning_rate": 1e-06,
"loss": 0.134,
"num_tokens": 394758899.0,
"reward": 0.7222451567649841,
"reward_std": 0.2374163269996643,
"rewards/progression_diversity/mean": -0.03622689098119736,
"rewards/progression_diversity/std": 0.10711545497179031,
"rewards/symbolic_reward_accuracy/mean": 0.7890625,
"rewards/symbolic_reward_accuracy/std": 0.4083731174468994,
"rewards/symbolic_reward_partial_score/mean": 0.86572265625,
"rewards/symbolic_reward_partial_score/std": 0.30833473801612854,
"rewards/tag_count_reward/mean": -0.10546875,
"rewards/tag_count_reward/std": 0.3074568510055542,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.108048915863037,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 292.0,
"sampling/sampling_logp_difference/mean": 2.0354158878326416,
"step": 1221
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.30622752010822296,
"epoch": 3.2157894736842105,
"grad_norm": 0.01329483650624752,
"learning_rate": 1e-06,
"loss": 0.2269,
"step": 1222
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.2995607703924179,
"epoch": 3.218421052631579,
"grad_norm": 0.011663331650197506,
"learning_rate": 1e-06,
"loss": 0.1555,
"step": 1223
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.29136380553245544,
"epoch": 3.221052631578947,
"grad_norm": 0.02316325716674328,
"learning_rate": 1e-06,
"loss": 0.2399,
"step": 1224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14746.0,
"completions/mean_length": 1491.333984375,
"completions/mean_terminated_length": 629.7747802734375,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"entropy": 0.31069743633270264,
"epoch": 3.223684210526316,
"frac_reward_zero_std": 0.3125,
"grad_norm": 37.16641616821289,
"learning_rate": 1e-06,
"loss": 0.1773,
"num_tokens": 395935454.0,
"reward": 0.8113774061203003,
"reward_std": 0.20781219005584717,
"rewards/progression_diversity/mean": -0.019492559134960175,
"rewards/progression_diversity/std": 0.07982630282640457,
"rewards/symbolic_reward_accuracy/mean": 0.900390625,
"rewards/symbolic_reward_accuracy/std": 0.29977133870124817,
"rewards/symbolic_reward_partial_score/mean": 0.9226887822151184,
"rewards/symbolic_reward_partial_score/std": 0.2527928054332733,
"rewards/tag_count_reward/mean": -0.0546875,
"rewards/tag_count_reward/std": 0.2275916188955307,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.096280813217163,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 292.0,
"sampling/sampling_logp_difference/mean": 1.8208186626434326,
"step": 1225
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.30570439994335175,
"epoch": 3.2263157894736842,
"grad_norm": 0.007866773754358292,
"learning_rate": 1e-06,
"loss": 0.215,
"step": 1226
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.32926367223262787,
"epoch": 3.2289473684210526,
"grad_norm": 0.04123647138476372,
"learning_rate": 1e-06,
"loss": 0.0619,
"step": 1227
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.32413050532341003,
"epoch": 3.231578947368421,
"grad_norm": 0.04225276783108711,
"learning_rate": 1e-06,
"loss": 0.1256,
"step": 1228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10278.0,
"completions/mean_length": 1581.517578125,
"completions/mean_terminated_length": 594.6854248046875,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.3033265024423599,
"epoch": 3.2342105263157896,
"frac_reward_zero_std": 0.34375,
"grad_norm": 168.14776611328125,
"learning_rate": 1e-06,
"loss": 0.2286,
"num_tokens": 397158023.0,
"reward": 0.8106723427772522,
"reward_std": 0.19477233290672302,
"rewards/progression_diversity/mean": -0.021638944745063782,
"rewards/progression_diversity/std": 0.08424000442028046,
"rewards/symbolic_reward_accuracy/mean": 0.900390625,
"rewards/symbolic_reward_accuracy/std": 0.29977133870124817,
"rewards/symbolic_reward_partial_score/mean": 0.9230143427848816,
"rewards/symbolic_reward_partial_score/std": 0.2554587125778198,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0959186553955078,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 292.0,
"sampling/sampling_logp_difference/mean": 1.717806100845337,
"step": 1229
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.32844409346580505,
"epoch": 3.236842105263158,
"grad_norm": 10.585503578186035,
"learning_rate": 1e-06,
"loss": 0.1134,
"step": 1230
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.31832896173000336,
"epoch": 3.2394736842105263,
"grad_norm": 1.0383377075195312,
"learning_rate": 1e-06,
"loss": 0.1613,
"step": 1231
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.33217278122901917,
"epoch": 3.2421052631578946,
"grad_norm": 0.013041791506111622,
"learning_rate": 1e-06,
"loss": 0.0293,
"step": 1232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.083984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13950.0,
"completions/mean_length": 1890.353515625,
"completions/mean_terminated_length": 561.51171875,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"entropy": 0.30604319274425507,
"epoch": 3.2447368421052634,
"frac_reward_zero_std": 0.25,
"grad_norm": 29.466703414916992,
"learning_rate": 1e-06,
"loss": 0.1197,
"num_tokens": 398532860.0,
"reward": 0.7769619226455688,
"reward_std": 0.22913838922977448,
"rewards/progression_diversity/mean": -0.028422629460692406,
"rewards/progression_diversity/std": 0.09470929205417633,
"rewards/symbolic_reward_accuracy/mean": 0.85546875,
"rewards/symbolic_reward_accuracy/std": 0.35197147727012634,
"rewards/symbolic_reward_partial_score/mean": 0.9046223759651184,
"rewards/symbolic_reward_partial_score/std": 0.27007627487182617,
"rewards/tag_count_reward/mean": -0.07421875,
"rewards/tag_count_reward/std": 0.2623828947544098,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.101804256439209,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 294.0,
"sampling/sampling_logp_difference/mean": 1.9203028678894043,
"step": 1233
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.31168776750564575,
"epoch": 3.2473684210526317,
"grad_norm": 59.52106857299805,
"learning_rate": 1e-06,
"loss": 0.2153,
"step": 1234
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3222711682319641,
"epoch": 3.25,
"grad_norm": 0.33637794852256775,
"learning_rate": 1e-06,
"loss": 0.0839,
"step": 1235
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.30836866796016693,
"epoch": 3.2526315789473683,
"grad_norm": 0.019555335864424706,
"learning_rate": 1e-06,
"loss": 0.1737,
"step": 1236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.169921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12113.0,
"completions/mean_length": 3351.146484375,
"completions/mean_terminated_length": 683.2446899414062,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.2953759431838989,
"epoch": 3.2552631578947366,
"frac_reward_zero_std": 0.21875,
"grad_norm": 87.26698303222656,
"learning_rate": 1e-06,
"loss": 0.1332,
"num_tokens": 400674471.0,
"reward": 0.6438003182411194,
"reward_std": 0.23994354903697968,
"rewards/progression_diversity/mean": -0.05356261506676674,
"rewards/progression_diversity/std": 0.12248878180980682,
"rewards/symbolic_reward_accuracy/mean": 0.703125,
"rewards/symbolic_reward_accuracy/std": 0.45732781291007996,
"rewards/symbolic_reward_partial_score/mean": 0.7936197519302368,
"rewards/symbolic_reward_partial_score/std": 0.36749520897865295,
"rewards/tag_count_reward/mean": -0.15625,
"rewards/tag_count_reward/std": 0.36344730854034424,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1080725193023682,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 294.0,
"sampling/sampling_logp_difference/mean": 2.1226110458374023,
"step": 1237
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.30956192314624786,
"epoch": 3.2578947368421054,
"grad_norm": 9.49582290649414,
"learning_rate": 1e-06,
"loss": 0.1708,
"step": 1238
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.21875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5703125,
"entropy": 0.27571606636047363,
"epoch": 3.2605263157894737,
"grad_norm": 0.008884378708899021,
"learning_rate": 1e-06,
"loss": 0.3496,
"step": 1239
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.27126601338386536,
"epoch": 3.263157894736842,
"grad_norm": 18.252225875854492,
"learning_rate": 1e-06,
"loss": 0.2307,
"step": 1240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 1876.0,
"completions/mean_length": 1877.640625,
"completions/mean_terminated_length": 513.794921875,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.32085326313972473,
"epoch": 3.2657894736842104,
"frac_reward_zero_std": 0.28125,
"grad_norm": 87.52373504638672,
"learning_rate": 1e-06,
"loss": 0.0879,
"num_tokens": 402027599.0,
"reward": 0.7780733108520508,
"reward_std": 0.20871524512767792,
"rewards/progression_diversity/mean": -0.029581986367702484,
"rewards/progression_diversity/std": 0.09899447858333588,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.90185546875,
"rewards/symbolic_reward_partial_score/std": 0.2770686745643616,
"rewards/tag_count_reward/mean": -0.078125,
"rewards/tag_count_reward/std": 0.26863065361976624,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0994126796722412,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 294.0,
"sampling/sampling_logp_difference/mean": 2.019242286682129,
"step": 1241
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.30657218396663666,
"epoch": 3.268421052631579,
"grad_norm": 0.007070607040077448,
"learning_rate": 1e-06,
"loss": 0.1977,
"step": 1242
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.30736126005649567,
"epoch": 3.2710526315789474,
"grad_norm": 0.016214022412896156,
"learning_rate": 1e-06,
"loss": 0.2739,
"step": 1243
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.32670439779758453,
"epoch": 3.2736842105263158,
"grad_norm": 0.010165052488446236,
"learning_rate": 1e-06,
"loss": 0.1227,
"step": 1244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.119140625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12475.0,
"completions/mean_length": 2460.5546875,
"completions/mean_terminated_length": 577.3392333984375,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"entropy": 0.29750363528728485,
"epoch": 3.276315789473684,
"frac_reward_zero_std": 0.28125,
"grad_norm": 246.82859802246094,
"learning_rate": 1e-06,
"loss": 0.2205,
"num_tokens": 403684715.0,
"reward": 0.7260443568229675,
"reward_std": 0.22644254565238953,
"rewards/progression_diversity/mean": -0.03717077150940895,
"rewards/progression_diversity/std": 0.10484474152326584,
"rewards/symbolic_reward_accuracy/mean": 0.798828125,
"rewards/symbolic_reward_accuracy/std": 0.4012683033943176,
"rewards/symbolic_reward_partial_score/mean": 0.8575845956802368,
"rewards/symbolic_reward_partial_score/std": 0.3189500868320465,
"rewards/tag_count_reward/mean": -0.1015625,
"rewards/tag_count_reward/std": 0.30236753821372986,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1037794351577759,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 296.0,
"sampling/sampling_logp_difference/mean": 2.2273709774017334,
"step": 1245
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.31314218044281006,
"epoch": 3.2789473684210524,
"grad_norm": 4.193942070007324,
"learning_rate": 1e-06,
"loss": 0.1196,
"step": 1246
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.309044748544693,
"epoch": 3.281578947368421,
"grad_norm": 0.0659249871969223,
"learning_rate": 1e-06,
"loss": 0.1756,
"step": 1247
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.285678505897522,
"epoch": 3.2842105263157895,
"grad_norm": 0.006575907580554485,
"learning_rate": 1e-06,
"loss": 0.2906,
"step": 1248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2405.0,
"completions/mean_length": 2114.935546875,
"completions/mean_terminated_length": 536.3622436523438,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"entropy": 0.30202820897102356,
"epoch": 3.286842105263158,
"frac_reward_zero_std": 0.25,
"grad_norm": 161.16702270507812,
"learning_rate": 1e-06,
"loss": 0.2409,
"num_tokens": 405162026.0,
"reward": 0.7425325512886047,
"reward_std": 0.2102564126253128,
"rewards/progression_diversity/mean": -0.028976155444979668,
"rewards/progression_diversity/std": 0.09275542944669724,
"rewards/symbolic_reward_accuracy/mean": 0.814453125,
"rewards/symbolic_reward_accuracy/std": 0.38912075757980347,
"rewards/symbolic_reward_partial_score/mean": 0.87451171875,
"rewards/symbolic_reward_partial_score/std": 0.3029172122478485,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1059821844100952,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 296.0,
"sampling/sampling_logp_difference/mean": 2.2559051513671875,
"step": 1249
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.32025621831417084,
"epoch": 3.2894736842105265,
"grad_norm": 0.03536779060959816,
"learning_rate": 1e-06,
"loss": 0.1425,
"step": 1250
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.29748618602752686,
"epoch": 3.292105263157895,
"grad_norm": 1.4454317092895508,
"learning_rate": 1e-06,
"loss": 0.2194,
"step": 1251
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.3038511127233505,
"epoch": 3.294736842105263,
"grad_norm": 0.013857961632311344,
"learning_rate": 1e-06,
"loss": 0.1161,
"step": 1252
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10572.0,
"completions/mean_length": 2446.869140625,
"completions/mean_terminated_length": 596.8074951171875,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"entropy": 0.3028118759393692,
"epoch": 3.2973684210526315,
"frac_reward_zero_std": 0.125,
"grad_norm": 439.4346008300781,
"learning_rate": 1e-06,
"loss": 0.2289,
"num_tokens": 406818631.0,
"reward": 0.7258546948432922,
"reward_std": 0.24171996116638184,
"rewards/progression_diversity/mean": -0.04148336872458458,
"rewards/progression_diversity/std": 0.1141848936676979,
"rewards/symbolic_reward_accuracy/mean": 0.80078125,
"rewards/symbolic_reward_accuracy/std": 0.39980348944664,
"rewards/symbolic_reward_partial_score/mean": 0.8564453125,
"rewards/symbolic_reward_partial_score/std": 0.33105501532554626,
"rewards/tag_count_reward/mean": -0.111328125,
"rewards/tag_count_reward/std": 0.31484565138816833,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1173810958862305,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 296.0,
"sampling/sampling_logp_difference/mean": 3.1980838775634766,
"step": 1253
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.3106544762849808,
"epoch": 3.3,
"grad_norm": 0.009494182653725147,
"learning_rate": 1e-06,
"loss": 0.1479,
"step": 1254
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.2966717481613159,
"epoch": 3.3026315789473686,
"grad_norm": 3.8081188201904297,
"learning_rate": 1e-06,
"loss": 0.2924,
"step": 1255
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.29619279503822327,
"epoch": 3.305263157894737,
"grad_norm": 0.04518857225775719,
"learning_rate": 1e-06,
"loss": 0.31,
"step": 1256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13429.0,
"completions/mean_length": 2205.7890625,
"completions/mean_terminated_length": 534.1222534179688,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.3126777410507202,
"epoch": 3.307894736842105,
"frac_reward_zero_std": 0.15625,
"grad_norm": 185.87486267089844,
"learning_rate": 1e-06,
"loss": 0.1568,
"num_tokens": 408320763.0,
"reward": 0.7695430517196655,
"reward_std": 0.2663387060165405,
"rewards/progression_diversity/mean": -0.03788202255964279,
"rewards/progression_diversity/std": 0.1108899936079979,
"rewards/symbolic_reward_accuracy/mean": 0.85546875,
"rewards/symbolic_reward_accuracy/std": 0.35197147727012634,
"rewards/symbolic_reward_partial_score/mean": 0.8873697519302368,
"rewards/symbolic_reward_partial_score/std": 0.2990962564945221,
"rewards/tag_count_reward/mean": -0.095703125,
"rewards/tag_count_reward/std": 0.2944713830947876,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1089106798171997,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 296.0,
"sampling/sampling_logp_difference/mean": 2.924543857574463,
"step": 1257
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.2874736040830612,
"epoch": 3.3105263157894735,
"grad_norm": 0.3372192084789276,
"learning_rate": 1e-06,
"loss": 0.3459,
"step": 1258
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3138102889060974,
"epoch": 3.3131578947368423,
"grad_norm": 0.008773678913712502,
"learning_rate": 1e-06,
"loss": 0.116,
"step": 1259
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.28731557726860046,
"epoch": 3.3157894736842106,
"grad_norm": 0.016135241836309433,
"learning_rate": 1e-06,
"loss": 0.2797,
"step": 1260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8639.0,
"completions/mean_length": 2694.34375,
"completions/mean_terminated_length": 597.729736328125,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"entropy": 0.2984755337238312,
"epoch": 3.318421052631579,
"frac_reward_zero_std": 0.34375,
"grad_norm": 48.49712371826172,
"learning_rate": 1e-06,
"loss": 0.1345,
"num_tokens": 410128779.0,
"reward": 0.7372407913208008,
"reward_std": 0.21289455890655518,
"rewards/progression_diversity/mean": -0.04057057946920395,
"rewards/progression_diversity/std": 0.10670918226242065,
"rewards/symbolic_reward_accuracy/mean": 0.8203125,
"rewards/symbolic_reward_accuracy/std": 0.38430243730545044,
"rewards/symbolic_reward_partial_score/mean": 0.85986328125,
"rewards/symbolic_reward_partial_score/std": 0.32551804184913635,
"rewards/tag_count_reward/mean": -0.125,
"rewards/tag_count_reward/std": 0.3310423493385315,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.096588373184204,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 296.0,
"sampling/sampling_logp_difference/mean": 2.3006858825683594,
"step": 1261
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.203125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.2700554430484772,
"epoch": 3.3210526315789473,
"grad_norm": 0.009636012837290764,
"learning_rate": 1e-06,
"loss": 0.3738,
"step": 1262
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.30288518965244293,
"epoch": 3.3236842105263156,
"grad_norm": 0.275381863117218,
"learning_rate": 1e-06,
"loss": 0.1203,
"step": 1263
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.3005267381668091,
"epoch": 3.3263157894736843,
"grad_norm": 0.15126138925552368,
"learning_rate": 1e-06,
"loss": 0.1354,
"step": 1264
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15602.0,
"completions/mean_length": 2717.74609375,
"completions/mean_terminated_length": 624.7162475585938,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"entropy": 0.29306742548942566,
"epoch": 3.3289473684210527,
"frac_reward_zero_std": 0.125,
"grad_norm": 84.46516418457031,
"learning_rate": 1e-06,
"loss": 0.2436,
"num_tokens": 411934761.0,
"reward": 0.6926791071891785,
"reward_std": 0.26792874932289124,
"rewards/progression_diversity/mean": -0.03872973471879959,
"rewards/progression_diversity/std": 0.10431662201881409,
"rewards/symbolic_reward_accuracy/mean": 0.755859375,
"rewards/symbolic_reward_accuracy/std": 0.42999663949012756,
"rewards/symbolic_reward_partial_score/mean": 0.8388671875,
"rewards/symbolic_reward_partial_score/std": 0.327902227640152,
"rewards/tag_count_reward/mean": -0.12109375,
"rewards/tag_count_reward/std": 0.3265552520751953,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.09934663772583,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 298.0,
"sampling/sampling_logp_difference/mean": 3.240568161010742,
"step": 1265
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.28837116062641144,
"epoch": 3.331578947368421,
"grad_norm": 42.86471939086914,
"learning_rate": 1e-06,
"loss": 0.2322,
"step": 1266
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.3078329712152481,
"epoch": 3.3342105263157893,
"grad_norm": 0.009662003256380558,
"learning_rate": 1e-06,
"loss": 0.1655,
"step": 1267
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5703125,
"entropy": 0.2638581395149231,
"epoch": 3.336842105263158,
"grad_norm": 0.007266658823937178,
"learning_rate": 1e-06,
"loss": 0.2735,
"step": 1268
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.103515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2300.0,
"completions/mean_length": 2193.158203125,
"completions/mean_terminated_length": 554.5642700195312,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"entropy": 0.3069176971912384,
"epoch": 3.3394736842105264,
"frac_reward_zero_std": 0.28125,
"grad_norm": 140.47747802734375,
"learning_rate": 1e-06,
"loss": 0.1502,
"num_tokens": 413436602.0,
"reward": 0.7662738561630249,
"reward_std": 0.20352765917778015,
"rewards/progression_diversity/mean": -0.032773204147815704,
"rewards/progression_diversity/std": 0.09917779266834259,
"rewards/symbolic_reward_accuracy/mean": 0.84375,
"rewards/symbolic_reward_accuracy/std": 0.36344730854034424,
"rewards/symbolic_reward_partial_score/mean": 0.9016926884651184,
"rewards/symbolic_reward_partial_score/std": 0.27043241262435913,
"rewards/tag_count_reward/mean": -0.1015625,
"rewards/tag_count_reward/std": 0.30236753821372986,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.093727946281433,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 298.0,
"sampling/sampling_logp_difference/mean": 3.0766854286193848,
"step": 1269
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.32548436522483826,
"epoch": 3.3421052631578947,
"grad_norm": 0.011535858735442162,
"learning_rate": 1e-06,
"loss": 0.1011,
"step": 1270
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2972973585128784,
"epoch": 3.344736842105263,
"grad_norm": 0.00575890950858593,
"learning_rate": 1e-06,
"loss": 0.2421,
"step": 1271
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.2989150583744049,
"epoch": 3.3473684210526318,
"grad_norm": 0.008553121238946915,
"learning_rate": 1e-06,
"loss": 0.2467,
"step": 1272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.087890625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 1972.0,
"completions/mean_length": 1971.931640625,
"completions/mean_terminated_length": 583.1884155273438,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"entropy": 0.31215590238571167,
"epoch": 3.35,
"frac_reward_zero_std": 0.21875,
"grad_norm": 132.39788818359375,
"learning_rate": 1e-06,
"loss": 0.1477,
"num_tokens": 414865399.0,
"reward": 0.7922816276550293,
"reward_std": 0.2368476241827011,
"rewards/progression_diversity/mean": -0.029654610902071,
"rewards/progression_diversity/std": 0.09719457477331161,
"rewards/symbolic_reward_accuracy/mean": 0.880859375,
"rewards/symbolic_reward_accuracy/std": 0.32427072525024414,
"rewards/symbolic_reward_partial_score/mean": 0.904296875,
"rewards/symbolic_reward_partial_score/std": 0.28047847747802734,
"rewards/tag_count_reward/mean": -0.072265625,
"rewards/tag_count_reward/std": 0.2591804563999176,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0867669582366943,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 300.0,
"sampling/sampling_logp_difference/mean": 3.5754165649414062,
"step": 1273
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3091353476047516,
"epoch": 3.3526315789473684,
"grad_norm": 0.010651475749909878,
"learning_rate": 1e-06,
"loss": 0.1685,
"step": 1274
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.30523787438869476,
"epoch": 3.3552631578947367,
"grad_norm": 0.008251101709902287,
"learning_rate": 1e-06,
"loss": 0.2036,
"step": 1275
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.29036298394203186,
"epoch": 3.3578947368421055,
"grad_norm": 0.012122713960707188,
"learning_rate": 1e-06,
"loss": 0.2905,
"step": 1276
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.083984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 1526.0,
"completions/mean_length": 1901.666015625,
"completions/mean_terminated_length": 573.8614501953125,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"entropy": 0.30242083966732025,
"epoch": 3.360526315789474,
"frac_reward_zero_std": 0.25,
"grad_norm": 185.1011199951172,
"learning_rate": 1e-06,
"loss": 0.2087,
"num_tokens": 416246188.0,
"reward": 0.7942166328430176,
"reward_std": 0.22256335616111755,
"rewards/progression_diversity/mean": -0.026581626385450363,
"rewards/progression_diversity/std": 0.0894814282655716,
"rewards/symbolic_reward_accuracy/mean": 0.87890625,
"rewards/symbolic_reward_accuracy/std": 0.3265552520751953,
"rewards/symbolic_reward_partial_score/mean": 0.9178059697151184,
"rewards/symbolic_reward_partial_score/std": 0.25296953320503235,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0797239542007446,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 300.0,
"sampling/sampling_logp_difference/mean": 3.349595069885254,
"step": 1277
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.30199746787548065,
"epoch": 3.363157894736842,
"grad_norm": 0.02538270130753517,
"learning_rate": 1e-06,
"loss": 0.2414,
"step": 1278
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.3231187015771866,
"epoch": 3.3657894736842104,
"grad_norm": 0.012012549676001072,
"learning_rate": 1e-06,
"loss": 0.1036,
"step": 1279
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3105398118495941,
"epoch": 3.3684210526315788,
"grad_norm": 0.013158765621483326,
"learning_rate": 1e-06,
"loss": 0.1455,
"step": 1280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.111328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2128.0,
"completions/mean_length": 2354.533203125,
"completions/mean_terminated_length": 596.99560546875,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"entropy": 0.30621521174907684,
"epoch": 3.3710526315789475,
"frac_reward_zero_std": 0.25,
"grad_norm": 140.82589721679688,
"learning_rate": 1e-06,
"loss": 0.1302,
"num_tokens": 417859869.0,
"reward": 0.7398780584335327,
"reward_std": 0.2521520256996155,
"rewards/progression_diversity/mean": -0.0356326699256897,
"rewards/progression_diversity/std": 0.10271129757165909,
"rewards/symbolic_reward_accuracy/mean": 0.81640625,
"rewards/symbolic_reward_accuracy/std": 0.3875311613082886,
"rewards/symbolic_reward_partial_score/mean": 0.8697916269302368,
"rewards/symbolic_reward_partial_score/std": 0.3091786503791809,
"rewards/tag_count_reward/mean": -0.10546875,
"rewards/tag_count_reward/std": 0.3074568510055542,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0742948055267334,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 302.0,
"sampling/sampling_logp_difference/mean": 3.784925699234009,
"step": 1281
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.29408055543899536,
"epoch": 3.373684210526316,
"grad_norm": 0.010642572306096554,
"learning_rate": 1e-06,
"loss": 0.1868,
"step": 1282
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.29353274405002594,
"epoch": 3.376315789473684,
"grad_norm": 0.01788030005991459,
"learning_rate": 1e-06,
"loss": 0.1809,
"step": 1283
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.2797112911939621,
"epoch": 3.3789473684210525,
"grad_norm": 0.006915436126291752,
"learning_rate": 1e-06,
"loss": 0.2527,
"step": 1284
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16124.0,
"completions/mean_length": 1972.298828125,
"completions/mean_terminated_length": 617.3526000976562,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"entropy": 0.33247339725494385,
"epoch": 3.3815789473684212,
"frac_reward_zero_std": 0.25,
"grad_norm": 348.1959228515625,
"learning_rate": 1e-06,
"loss": 0.0597,
"num_tokens": 419281174.0,
"reward": 0.793836772441864,
"reward_std": 0.20707328617572784,
"rewards/progression_diversity/mean": -0.030389215797185898,
"rewards/progression_diversity/std": 0.09979861974716187,
"rewards/symbolic_reward_accuracy/mean": 0.87890625,
"rewards/symbolic_reward_accuracy/std": 0.3265552520751953,
"rewards/symbolic_reward_partial_score/mean": 0.9153646230697632,
"rewards/symbolic_reward_partial_score/std": 0.25428155064582825,
"rewards/tag_count_reward/mean": -0.078125,
"rewards/tag_count_reward/std": 0.26863065361976624,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.062885046005249,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 304.0,
"sampling/sampling_logp_difference/mean": 3.6513113975524902,
"step": 1285
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3000902533531189,
"epoch": 3.3842105263157896,
"grad_norm": 0.006365468725562096,
"learning_rate": 1e-06,
"loss": 0.1583,
"step": 1286
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.2907492071390152,
"epoch": 3.386842105263158,
"grad_norm": 0.013992332853376865,
"learning_rate": 1e-06,
"loss": 0.2211,
"step": 1287
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2959248870611191,
"epoch": 3.389473684210526,
"grad_norm": 0.006391443312168121,
"learning_rate": 1e-06,
"loss": 0.2083,
"step": 1288
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9612.0,
"completions/mean_length": 2112.052734375,
"completions/mean_terminated_length": 635.6444091796875,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"entropy": 0.3044814467430115,
"epoch": 3.3921052631578945,
"frac_reward_zero_std": 0.15625,
"grad_norm": 314.1883239746094,
"learning_rate": 1e-06,
"loss": 0.1179,
"num_tokens": 420783377.0,
"reward": 0.77393639087677,
"reward_std": 0.25700780749320984,
"rewards/progression_diversity/mean": -0.03312736004590988,
"rewards/progression_diversity/std": 0.10419656336307526,
"rewards/symbolic_reward_accuracy/mean": 0.857421875,
"rewards/symbolic_reward_accuracy/std": 0.3499840497970581,
"rewards/symbolic_reward_partial_score/mean": 0.8946939706802368,
"rewards/symbolic_reward_partial_score/std": 0.2826669216156006,
"rewards/tag_count_reward/mean": -0.0859375,
"rewards/tag_count_reward/std": 0.28054583072662354,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0506939888000488,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 304.0,
"sampling/sampling_logp_difference/mean": 5.086845397949219,
"step": 1289
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.27778656780719757,
"epoch": 3.3947368421052633,
"grad_norm": 0.013014139607548714,
"learning_rate": 1e-06,
"loss": 0.2738,
"step": 1290
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.2996399998664856,
"epoch": 3.3973684210526316,
"grad_norm": 0.06032712757587433,
"learning_rate": 1e-06,
"loss": 0.2089,
"step": 1291
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.30588841438293457,
"epoch": 3.4,
"grad_norm": 0.018283555284142494,
"learning_rate": 1e-06,
"loss": 0.1612,
"step": 1292
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10146.0,
"completions/mean_length": 2171.923828125,
"completions/mean_terminated_length": 565.34130859375,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"entropy": 0.3020727336406708,
"epoch": 3.4026315789473682,
"frac_reward_zero_std": 0.25,
"grad_norm": 125.83609008789062,
"learning_rate": 1e-06,
"loss": 0.0761,
"num_tokens": 422257322.0,
"reward": 0.7886345386505127,
"reward_std": 0.19221815466880798,
"rewards/progression_diversity/mean": -0.03791151940822601,
"rewards/progression_diversity/std": 0.11339190602302551,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.9158528447151184,
"rewards/symbolic_reward_partial_score/std": 0.25923240184783936,
"rewards/tag_count_reward/mean": -0.083984375,
"rewards/tag_count_reward/std": 0.2776356339454651,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0494663715362549,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 304.0,
"sampling/sampling_logp_difference/mean": 4.9170942306518555,
"step": 1293
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.29434891045093536,
"epoch": 3.405263157894737,
"grad_norm": 0.01162765920162201,
"learning_rate": 1e-06,
"loss": 0.1834,
"step": 1294
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2950708270072937,
"epoch": 3.4078947368421053,
"grad_norm": 0.009900042787194252,
"learning_rate": 1e-06,
"loss": 0.2169,
"step": 1295
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.30467459559440613,
"epoch": 3.4105263157894736,
"grad_norm": 0.009985024109482765,
"learning_rate": 1e-06,
"loss": 0.1849,
"step": 1296
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15928.0,
"completions/mean_length": 3336.40625,
"completions/mean_terminated_length": 628.4151000976562,
"completions/min_length": 266.0,
"completions/min_terminated_length": 266.0,
"entropy": 0.2799158841371536,
"epoch": 3.413157894736842,
"frac_reward_zero_std": 0.125,
"grad_norm": 283.4808654785156,
"learning_rate": 1e-06,
"loss": 0.166,
"num_tokens": 424360346.0,
"reward": 0.7545630931854248,
"reward_std": 0.22717466950416565,
"rewards/progression_diversity/mean": -0.06615223735570908,
"rewards/progression_diversity/std": 0.14478321373462677,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.87744140625,
"rewards/symbolic_reward_partial_score/std": 0.3093447685241699,
"rewards/tag_count_reward/mean": -0.119140625,
"rewards/tag_count_reward/std": 0.32427072525024414,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0230858325958252,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 304.0,
"sampling/sampling_logp_difference/mean": 7.494105815887451,
"step": 1297
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.23919285833835602,
"epoch": 3.4157894736842107,
"grad_norm": 1.23762845993042,
"learning_rate": 1e-06,
"loss": 0.371,
"step": 1298
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.25987882912158966,
"epoch": 3.418421052631579,
"grad_norm": 0.5271911025047302,
"learning_rate": 1e-06,
"loss": 0.2374,
"step": 1299
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.26133809983730316,
"epoch": 3.4210526315789473,
"grad_norm": 0.11925064027309418,
"learning_rate": 1e-06,
"loss": 0.188,
"step": 1300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10431.0,
"completions/mean_length": 3059.662109375,
"completions/mean_terminated_length": 592.192138671875,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.2862105369567871,
"epoch": 3.4236842105263157,
"frac_reward_zero_std": 0.09375,
"grad_norm": 341.6363830566406,
"learning_rate": 1e-06,
"loss": 0.195,
"num_tokens": 426325869.0,
"reward": 0.7616207599639893,
"reward_std": 0.24166487157344818,
"rewards/progression_diversity/mean": -0.05862627178430557,
"rewards/progression_diversity/std": 0.13662515580654144,
"rewards/symbolic_reward_accuracy/mean": 0.845703125,
"rewards/symbolic_reward_accuracy/std": 0.36158639192581177,
"rewards/symbolic_reward_partial_score/mean": 0.8831380009651184,
"rewards/symbolic_reward_partial_score/std": 0.301278293132782,
"rewards/tag_count_reward/mean": -0.1015625,
"rewards/tag_count_reward/std": 0.30236753821372986,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0274146795272827,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 304.0,
"sampling/sampling_logp_difference/mean": 7.100645542144775,
"step": 1301
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.27012303471565247,
"epoch": 3.4263157894736844,
"grad_norm": 3.0215957164764404,
"learning_rate": 1e-06,
"loss": 0.2341,
"step": 1302
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.26916660368442535,
"epoch": 3.4289473684210527,
"grad_norm": 37.49262619018555,
"learning_rate": 1e-06,
"loss": 0.1515,
"step": 1303
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.53125,
"entropy": 0.27728109061717987,
"epoch": 3.431578947368421,
"grad_norm": 6.821070194244385,
"learning_rate": 1e-06,
"loss": 0.2638,
"step": 1304
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15417.0,
"completions/mean_length": 3132.240234375,
"completions/mean_terminated_length": 678.2106323242188,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.2735847979784012,
"epoch": 3.4342105263157894,
"frac_reward_zero_std": 0.125,
"grad_norm": 166.9344940185547,
"learning_rate": 1e-06,
"loss": 0.2523,
"num_tokens": 428342568.0,
"reward": 0.7301239967346191,
"reward_std": 0.27412816882133484,
"rewards/progression_diversity/mean": -0.05889313295483589,
"rewards/progression_diversity/std": 0.136581152677536,
"rewards/symbolic_reward_accuracy/mean": 0.806640625,
"rewards/symbolic_reward_accuracy/std": 0.39531853795051575,
"rewards/symbolic_reward_partial_score/mean": 0.8640950918197632,
"rewards/symbolic_reward_partial_score/std": 0.31252816319465637,
"rewards/tag_count_reward/mean": -0.125,
"rewards/tag_count_reward/std": 0.3310423493385315,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0343992710113525,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 306.0,
"sampling/sampling_logp_difference/mean": 6.35936164855957,
"step": 1305
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.264299601316452,
"epoch": 3.4368421052631577,
"grad_norm": 14.73751449584961,
"learning_rate": 1e-06,
"loss": 0.1532,
"step": 1306
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.27560053765773773,
"epoch": 3.4394736842105265,
"grad_norm": 2.936858892440796,
"learning_rate": 1e-06,
"loss": 0.2459,
"step": 1307
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.2791202515363693,
"epoch": 3.442105263157895,
"grad_norm": 0.5196275115013123,
"learning_rate": 1e-06,
"loss": 0.2228,
"step": 1308
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.130859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2387.0,
"completions/mean_length": 2650.87890625,
"completions/mean_terminated_length": 583.1954956054688,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.30482953786849976,
"epoch": 3.444736842105263,
"frac_reward_zero_std": 0.03125,
"grad_norm": 386.9339294433594,
"learning_rate": 1e-06,
"loss": 0.1641,
"num_tokens": 430092778.0,
"reward": 0.7358328104019165,
"reward_std": 0.2448652684688568,
"rewards/progression_diversity/mean": -0.04465062916278839,
"rewards/progression_diversity/std": 0.11796204745769501,
"rewards/symbolic_reward_accuracy/mean": 0.806640625,
"rewards/symbolic_reward_accuracy/std": 0.39531853795051575,
"rewards/symbolic_reward_partial_score/mean": 0.87548828125,
"rewards/symbolic_reward_partial_score/std": 0.299261212348938,
"rewards/tag_count_reward/mean": -0.103515625,
"rewards/tag_count_reward/std": 0.30492907762527466,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0414533615112305,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 304.0,
"sampling/sampling_logp_difference/mean": 5.4577789306640625,
"step": 1309
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.28392690420150757,
"epoch": 3.4473684210526314,
"grad_norm": 160.2227020263672,
"learning_rate": 1e-06,
"loss": 0.2389,
"step": 1310
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3085232526063919,
"epoch": 3.45,
"grad_norm": 0.17238354682922363,
"learning_rate": 1e-06,
"loss": 0.1332,
"step": 1311
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3984375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.53125,
"entropy": 0.2967042475938797,
"epoch": 3.4526315789473685,
"grad_norm": 7.176582336425781,
"learning_rate": 1e-06,
"loss": 0.1358,
"step": 1312
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.103515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9963.0,
"completions/mean_length": 2258.275390625,
"completions/mean_terminated_length": 627.200439453125,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"entropy": 0.31084753572940826,
"epoch": 3.455263157894737,
"frac_reward_zero_std": 0.25,
"grad_norm": 914.4140625,
"learning_rate": 1e-06,
"loss": 0.1569,
"num_tokens": 431662167.0,
"reward": 0.7634491920471191,
"reward_std": 0.23263157904148102,
"rewards/progression_diversity/mean": -0.03691425174474716,
"rewards/progression_diversity/std": 0.10997199267148972,
"rewards/symbolic_reward_accuracy/mean": 0.841796875,
"rewards/symbolic_reward_accuracy/std": 0.36528825759887695,
"rewards/symbolic_reward_partial_score/mean": 0.8917642831802368,
"rewards/symbolic_reward_partial_score/std": 0.28572434186935425,
"rewards/tag_count_reward/mean": -0.087890625,
"rewards/tag_count_reward/std": 0.2834126651287079,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0521239042282104,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 4.490725994110107,
"step": 1313
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3135823905467987,
"epoch": 3.457894736842105,
"grad_norm": 48.58332824707031,
"learning_rate": 1e-06,
"loss": 0.1558,
"step": 1314
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.32143357396125793,
"epoch": 3.4605263157894735,
"grad_norm": 0.030546952039003372,
"learning_rate": 1e-06,
"loss": 0.1197,
"step": 1315
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.29036542773246765,
"epoch": 3.463157894736842,
"grad_norm": 0.008445881307125092,
"learning_rate": 1e-06,
"loss": 0.2007,
"step": 1316
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2140.0,
"completions/mean_length": 2460.689453125,
"completions/mean_terminated_length": 612.46240234375,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.29718485474586487,
"epoch": 3.4657894736842105,
"frac_reward_zero_std": 0.15625,
"grad_norm": 533.7861938476562,
"learning_rate": 1e-06,
"loss": 0.1662,
"num_tokens": 433313176.0,
"reward": 0.7628533840179443,
"reward_std": 0.2578679919242859,
"rewards/progression_diversity/mean": -0.04279157519340515,
"rewards/progression_diversity/std": 0.11989957839250565,
"rewards/symbolic_reward_accuracy/mean": 0.84375,
"rewards/symbolic_reward_accuracy/std": 0.36344730854034424,
"rewards/symbolic_reward_partial_score/mean": 0.8880208730697632,
"rewards/symbolic_reward_partial_score/std": 0.28819218277931213,
"rewards/tag_count_reward/mean": -0.09375,
"rewards/tag_count_reward/std": 0.29176566004753113,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0420756340026855,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 5.4957427978515625,
"step": 1317
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.3101320266723633,
"epoch": 3.468421052631579,
"grad_norm": 0.01260958332568407,
"learning_rate": 1e-06,
"loss": 0.1788,
"step": 1318
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.27350659668445587,
"epoch": 3.4710526315789476,
"grad_norm": 3.119270086288452,
"learning_rate": 1e-06,
"loss": 0.3215,
"step": 1319
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.40625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.484375,
"entropy": 0.3225550502538681,
"epoch": 3.473684210526316,
"grad_norm": 2.4085659980773926,
"learning_rate": 1e-06,
"loss": 0.1297,
"step": 1320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16143.0,
"completions/mean_length": 2217.62109375,
"completions/mean_terminated_length": 650.4078369140625,
"completions/min_length": 238.0,
"completions/min_terminated_length": 238.0,
"entropy": 0.3091781437397003,
"epoch": 3.4763157894736842,
"frac_reward_zero_std": 0.25,
"grad_norm": 291.2024230957031,
"learning_rate": 1e-06,
"loss": 0.1649,
"num_tokens": 434844726.0,
"reward": 0.7853677272796631,
"reward_std": 0.206242173910141,
"rewards/progression_diversity/mean": -0.03744862973690033,
"rewards/progression_diversity/std": 0.11320951581001282,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.9075521230697632,
"rewards/symbolic_reward_partial_score/std": 0.2654723823070526,
"rewards/tag_count_reward/mean": -0.091796875,
"rewards/tag_count_reward/std": 0.289021372795105,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.050972580909729,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 4.5054779052734375,
"step": 1321
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.484375,
"entropy": 0.30050645768642426,
"epoch": 3.4789473684210526,
"grad_norm": 0.1077585518360138,
"learning_rate": 1e-06,
"loss": 0.2148,
"step": 1322
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.3242476284503937,
"epoch": 3.481578947368421,
"grad_norm": 0.015654178336262703,
"learning_rate": 1e-06,
"loss": 0.1183,
"step": 1323
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3109499663114548,
"epoch": 3.4842105263157896,
"grad_norm": 0.02455841936171055,
"learning_rate": 1e-06,
"loss": 0.1862,
"step": 1324
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.158203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8314.0,
"completions/mean_length": 3150.29296875,
"completions/mean_terminated_length": 663.2157592773438,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"entropy": 0.27029645442962646,
"epoch": 3.486842105263158,
"frac_reward_zero_std": 0.125,
"grad_norm": 303.6960144042969,
"learning_rate": 1e-06,
"loss": 0.1961,
"num_tokens": 436880172.0,
"reward": 0.7058703899383545,
"reward_std": 0.2576483488082886,
"rewards/progression_diversity/mean": -0.057497672736644745,
"rewards/progression_diversity/std": 0.13345807790756226,
"rewards/symbolic_reward_accuracy/mean": 0.76953125,
"rewards/symbolic_reward_accuracy/std": 0.42154473066329956,
"rewards/symbolic_reward_partial_score/mean": 0.8619791269302368,
"rewards/symbolic_reward_partial_score/std": 0.3079785406589508,
"rewards/tag_count_reward/mean": -0.138671875,
"rewards/tag_count_reward/std": 0.34594178199768066,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0368916988372803,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 5.3984527587890625,
"step": 1325
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.2876896560192108,
"epoch": 3.4894736842105263,
"grad_norm": 0.016265718266367912,
"learning_rate": 1e-06,
"loss": 0.1761,
"step": 1326
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.27050982415676117,
"epoch": 3.4921052631578946,
"grad_norm": 1.3987834453582764,
"learning_rate": 1e-06,
"loss": 0.269,
"step": 1327
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.28464990854263306,
"epoch": 3.4947368421052634,
"grad_norm": 32.02823257446289,
"learning_rate": 1e-06,
"loss": 0.1709,
"step": 1328
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10308.0,
"completions/mean_length": 1944.478515625,
"completions/mean_terminated_length": 654.1383056640625,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.2990902215242386,
"epoch": 3.4973684210526317,
"frac_reward_zero_std": 0.1875,
"grad_norm": 411.1249694824219,
"learning_rate": 1e-06,
"loss": 0.2033,
"num_tokens": 438276225.0,
"reward": 0.7881462574005127,
"reward_std": 0.22861242294311523,
"rewards/progression_diversity/mean": -0.03303021937608719,
"rewards/progression_diversity/std": 0.1093946024775505,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.9140625,
"rewards/symbolic_reward_partial_score/std": 0.2569299340248108,
"rewards/tag_count_reward/mean": -0.083984375,
"rewards/tag_count_reward/std": 0.2776356339454651,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.038718819618225,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 5.05161190032959,
"step": 1329
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3089918792247772,
"epoch": 3.5,
"grad_norm": 0.021115725859999657,
"learning_rate": 1e-06,
"loss": 0.1788,
"step": 1330
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.3167189657688141,
"epoch": 3.5026315789473683,
"grad_norm": 0.010390168987214565,
"learning_rate": 1e-06,
"loss": 0.1894,
"step": 1331
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.33414115011692047,
"epoch": 3.5052631578947366,
"grad_norm": 0.01447315514087677,
"learning_rate": 1e-06,
"loss": 0.0922,
"step": 1332
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3667.0,
"completions/mean_length": 2530.16015625,
"completions/mean_terminated_length": 621.4088745117188,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.2866598814725876,
"epoch": 3.5078947368421054,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1008.520751953125,
"learning_rate": 1e-06,
"loss": 0.2519,
"num_tokens": 439965971.0,
"reward": 0.7736403942108154,
"reward_std": 0.23456132411956787,
"rewards/progression_diversity/mean": -0.0480697825551033,
"rewards/progression_diversity/std": 0.13115154206752777,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.8948568105697632,
"rewards/symbolic_reward_partial_score/std": 0.2902930974960327,
"rewards/tag_count_reward/mean": -0.111328125,
"rewards/tag_count_reward/std": 0.31484565138816833,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0280325412750244,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 5.589191436767578,
"step": 1333
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.30402611196041107,
"epoch": 3.5105263157894737,
"grad_norm": 1195.4456787109375,
"learning_rate": 1e-06,
"loss": 0.2695,
"step": 1334
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.29913991689682007,
"epoch": 3.513157894736842,
"grad_norm": 0.3809571862220764,
"learning_rate": 1e-06,
"loss": 0.1482,
"step": 1335
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.31326740980148315,
"epoch": 3.515789473684211,
"grad_norm": 1.5359933376312256,
"learning_rate": 1e-06,
"loss": 0.1986,
"step": 1336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10774.0,
"completions/mean_length": 2040.724609375,
"completions/mean_terminated_length": 624.8648071289062,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"entropy": 0.3067469596862793,
"epoch": 3.518421052631579,
"frac_reward_zero_std": 0.34375,
"grad_norm": 130.65467834472656,
"learning_rate": 1e-06,
"loss": 0.1706,
"num_tokens": 441414950.0,
"reward": 0.7661923170089722,
"reward_std": 0.18958313763141632,
"rewards/progression_diversity/mean": -0.03604454547166824,
"rewards/progression_diversity/std": 0.11484090983867645,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.9021810293197632,
"rewards/symbolic_reward_partial_score/std": 0.2684669494628906,
"rewards/tag_count_reward/mean": -0.080078125,
"rewards/tag_count_reward/std": 0.271679550409317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0413990020751953,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 4.274114608764648,
"step": 1337
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.3064485937356949,
"epoch": 3.5210526315789474,
"grad_norm": 4.137460708618164,
"learning_rate": 1e-06,
"loss": 0.1474,
"step": 1338
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.30475564301013947,
"epoch": 3.5236842105263158,
"grad_norm": 0.25504910945892334,
"learning_rate": 1e-06,
"loss": 0.1556,
"step": 1339
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.3473477065563202,
"epoch": 3.526315789473684,
"grad_norm": 0.14421966671943665,
"learning_rate": 1e-06,
"loss": 0.076,
"step": 1340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.119140625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15636.0,
"completions/mean_length": 2503.556640625,
"completions/mean_terminated_length": 626.1574096679688,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"entropy": 0.3233236074447632,
"epoch": 3.5289473684210524,
"frac_reward_zero_std": 0.21875,
"grad_norm": 764.384765625,
"learning_rate": 1e-06,
"loss": 0.1338,
"num_tokens": 443064707.0,
"reward": 0.7563366889953613,
"reward_std": 0.2102624773979187,
"rewards/progression_diversity/mean": -0.04993094503879547,
"rewards/progression_diversity/std": 0.1373254358768463,
"rewards/symbolic_reward_accuracy/mean": 0.8359375,
"rewards/symbolic_reward_accuracy/std": 0.37069445848464966,
"rewards/symbolic_reward_partial_score/mean": 0.8860677480697632,
"rewards/symbolic_reward_partial_score/std": 0.28898024559020996,
"rewards/tag_count_reward/mean": -0.10546875,
"rewards/tag_count_reward/std": 0.3074568510055542,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0196648836135864,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 6.12931489944458,
"step": 1341
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2928446978330612,
"epoch": 3.531578947368421,
"grad_norm": 0.5019714832305908,
"learning_rate": 1e-06,
"loss": 0.224,
"step": 1342
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3110320568084717,
"epoch": 3.5342105263157895,
"grad_norm": 0.13324137032032013,
"learning_rate": 1e-06,
"loss": 0.1591,
"step": 1343
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.31039758026599884,
"epoch": 3.536842105263158,
"grad_norm": 0.014469148591160774,
"learning_rate": 1e-06,
"loss": 0.2039,
"step": 1344
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.111328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2137.0,
"completions/mean_length": 2396.134765625,
"completions/mean_terminated_length": 643.808837890625,
"completions/min_length": 264.0,
"completions/min_terminated_length": 264.0,
"entropy": 0.33016908168792725,
"epoch": 3.5394736842105265,
"frac_reward_zero_std": 0.25,
"grad_norm": 333.5965576171875,
"learning_rate": 1e-06,
"loss": 0.108,
"num_tokens": 444689480.0,
"reward": 0.7707592844963074,
"reward_std": 0.21156515181064606,
"rewards/progression_diversity/mean": -0.04321339726448059,
"rewards/progression_diversity/std": 0.1244661882519722,
"rewards/symbolic_reward_accuracy/mean": 0.84765625,
"rewards/symbolic_reward_accuracy/std": 0.35970520973205566,
"rewards/symbolic_reward_partial_score/mean": 0.9091796875,
"rewards/symbolic_reward_partial_score/std": 0.25746801495552063,
"rewards/tag_count_reward/mean": -0.1015625,
"rewards/tag_count_reward/std": 0.30236753821372986,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.030545949935913,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 5.220053672790527,
"step": 1345
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.3137245178222656,
"epoch": 3.542105263157895,
"grad_norm": 0.03173833340406418,
"learning_rate": 1e-06,
"loss": 0.1528,
"step": 1346
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3061438798904419,
"epoch": 3.544736842105263,
"grad_norm": 0.012165974825620651,
"learning_rate": 1e-06,
"loss": 0.1486,
"step": 1347
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.3039957582950592,
"epoch": 3.5473684210526315,
"grad_norm": 0.023023858666419983,
"learning_rate": 1e-06,
"loss": 0.2288,
"step": 1348
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.115234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9747.0,
"completions/mean_length": 2442.896484375,
"completions/mean_terminated_length": 627.1677856445312,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 0.31957219541072845,
"epoch": 3.55,
"frac_reward_zero_std": 0.21875,
"grad_norm": 117.19609069824219,
"learning_rate": 1e-06,
"loss": 0.1076,
"num_tokens": 446334707.0,
"reward": 0.7665076851844788,
"reward_std": 0.2067509889602661,
"rewards/progression_diversity/mean": -0.04357065260410309,
"rewards/progression_diversity/std": 0.12135568261146545,
"rewards/symbolic_reward_accuracy/mean": 0.845703125,
"rewards/symbolic_reward_accuracy/std": 0.36158639192581177,
"rewards/symbolic_reward_partial_score/mean": 0.90087890625,
"rewards/symbolic_reward_partial_score/std": 0.2657473385334015,
"rewards/tag_count_reward/mean": -0.107421875,
"rewards/tag_count_reward/std": 0.30995169281959534,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0276845693588257,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 5.852321624755859,
"step": 1349
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.30125513672828674,
"epoch": 3.5526315789473686,
"grad_norm": 0.008969382382929325,
"learning_rate": 1e-06,
"loss": 0.2238,
"step": 1350
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.300917848944664,
"epoch": 3.555263157894737,
"grad_norm": 0.008844579569995403,
"learning_rate": 1e-06,
"loss": 0.1436,
"step": 1351
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.28367167711257935,
"epoch": 3.557894736842105,
"grad_norm": 0.01489369384944439,
"learning_rate": 1e-06,
"loss": 0.2285,
"step": 1352
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.138671875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15503.0,
"completions/mean_length": 2874.439453125,
"completions/mean_terminated_length": 699.4308471679688,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"entropy": 0.29677602648735046,
"epoch": 3.5605263157894735,
"frac_reward_zero_std": 0.0625,
"grad_norm": 187.5312957763672,
"learning_rate": 1e-06,
"loss": 0.1843,
"num_tokens": 448214740.0,
"reward": 0.7094203233718872,
"reward_std": 0.2542072832584381,
"rewards/progression_diversity/mean": -0.04918673634529114,
"rewards/progression_diversity/std": 0.1233910620212555,
"rewards/symbolic_reward_accuracy/mean": 0.783203125,
"rewards/symbolic_reward_accuracy/std": 0.4124660789966583,
"rewards/symbolic_reward_partial_score/mean": 0.8396809697151184,
"rewards/symbolic_reward_partial_score/std": 0.3393942713737488,
"rewards/tag_count_reward/mean": -0.119140625,
"rewards/tag_count_reward/std": 0.32427072525024414,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0222926139831543,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 308.0,
"sampling/sampling_logp_difference/mean": 6.464677810668945,
"step": 1353
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.30209778249263763,
"epoch": 3.5631578947368423,
"grad_norm": 0.01030214224010706,
"learning_rate": 1e-06,
"loss": 0.1664,
"step": 1354
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.2921713888645172,
"epoch": 3.5657894736842106,
"grad_norm": 0.3698064982891083,
"learning_rate": 1e-06,
"loss": 0.2283,
"step": 1355
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.453125,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.6171875,
"entropy": 0.2947758883237839,
"epoch": 3.568421052631579,
"grad_norm": 0.0077121201902627945,
"learning_rate": 1e-06,
"loss": 0.2436,
"step": 1356
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.111328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10491.0,
"completions/mean_length": 2377.16796875,
"completions/mean_terminated_length": 622.4659423828125,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"entropy": 0.30925987660884857,
"epoch": 3.5710526315789473,
"frac_reward_zero_std": 0.15625,
"grad_norm": 539.5180053710938,
"learning_rate": 1e-06,
"loss": 0.1286,
"num_tokens": 449834314.0,
"reward": 0.7453286647796631,
"reward_std": 0.21328219771385193,
"rewards/progression_diversity/mean": -0.03745080530643463,
"rewards/progression_diversity/std": 0.10799506306648254,
"rewards/symbolic_reward_accuracy/mean": 0.822265625,
"rewards/symbolic_reward_accuracy/std": 0.3826628625392914,
"rewards/symbolic_reward_partial_score/mean": 0.8756510019302368,
"rewards/symbolic_reward_partial_score/std": 0.29454636573791504,
"rewards/tag_count_reward/mean": -0.103515625,
"rewards/tag_count_reward/std": 0.30492907762527466,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.033726692199707,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 312.0,
"sampling/sampling_logp_difference/mean": 5.676729202270508,
"step": 1357
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.31313398480415344,
"epoch": 3.5736842105263156,
"grad_norm": 0.014206201769411564,
"learning_rate": 1e-06,
"loss": 0.1543,
"step": 1358
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3828125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.2981453835964203,
"epoch": 3.5763157894736843,
"grad_norm": 0.24512311816215515,
"learning_rate": 1e-06,
"loss": 0.2296,
"step": 1359
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.3176209479570389,
"epoch": 3.5789473684210527,
"grad_norm": 0.02592810057103634,
"learning_rate": 1e-06,
"loss": 0.2541,
"step": 1360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14099.0,
"completions/mean_length": 2200.619140625,
"completions/mean_terminated_length": 631.5249633789062,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"entropy": 0.320136159658432,
"epoch": 3.581578947368421,
"frac_reward_zero_std": 0.28125,
"grad_norm": 498.01348876953125,
"learning_rate": 1e-06,
"loss": 0.1276,
"num_tokens": 451356839.0,
"reward": 0.7508112192153931,
"reward_std": 0.2062763273715973,
"rewards/progression_diversity/mean": -0.031185226514935493,
"rewards/progression_diversity/std": 0.09589832276105881,
"rewards/symbolic_reward_accuracy/mean": 0.82421875,
"rewards/symbolic_reward_accuracy/std": 0.3810062110424042,
"rewards/symbolic_reward_partial_score/mean": 0.8839517831802368,
"rewards/symbolic_reward_partial_score/std": 0.2886301875114441,
"rewards/tag_count_reward/mean": -0.0859375,
"rewards/tag_count_reward/std": 0.28054583072662354,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0460567474365234,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 312.0,
"sampling/sampling_logp_difference/mean": 4.3595733642578125,
"step": 1361
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3112920671701431,
"epoch": 3.5842105263157897,
"grad_norm": 0.022203197702765465,
"learning_rate": 1e-06,
"loss": 0.218,
"step": 1362
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.31792403757572174,
"epoch": 3.586842105263158,
"grad_norm": 0.020310793071985245,
"learning_rate": 1e-06,
"loss": 0.1697,
"step": 1363
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.3065248429775238,
"epoch": 3.5894736842105264,
"grad_norm": 0.00812375359237194,
"learning_rate": 1e-06,
"loss": 0.1274,
"step": 1364
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.138671875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11008.0,
"completions/mean_length": 2851.1796875,
"completions/mean_terminated_length": 672.4263305664062,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"entropy": 0.2910042405128479,
"epoch": 3.5921052631578947,
"frac_reward_zero_std": 0.15625,
"grad_norm": 208.19349670410156,
"learning_rate": 1e-06,
"loss": 0.2613,
"num_tokens": 453229795.0,
"reward": 0.699255645275116,
"reward_std": 0.22771009802818298,
"rewards/progression_diversity/mean": -0.04025688022375107,
"rewards/progression_diversity/std": 0.10407532006502151,
"rewards/symbolic_reward_accuracy/mean": 0.76171875,
"rewards/symbolic_reward_accuracy/std": 0.42644867300987244,
"rewards/symbolic_reward_partial_score/mean": 0.8439127802848816,
"rewards/symbolic_reward_partial_score/std": 0.31969720125198364,
"rewards/tag_count_reward/mean": -0.10546875,
"rewards/tag_count_reward/std": 0.3074568510055542,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0359622240066528,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 312.0,
"sampling/sampling_logp_difference/mean": 5.4447712898254395,
"step": 1365
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.2833098769187927,
"epoch": 3.594736842105263,
"grad_norm": 1.1845999956130981,
"learning_rate": 1e-06,
"loss": 0.1663,
"step": 1366
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2765520215034485,
"epoch": 3.5973684210526313,
"grad_norm": 4.952581882476807,
"learning_rate": 1e-06,
"loss": 0.2748,
"step": 1367
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.2997213453054428,
"epoch": 3.6,
"grad_norm": 0.008390809409320354,
"learning_rate": 1e-06,
"loss": 0.1059,
"step": 1368
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.150390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2273.0,
"completions/mean_length": 2958.927734375,
"completions/mean_terminated_length": 582.53564453125,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"entropy": 0.2849253863096237,
"epoch": 3.6026315789473684,
"frac_reward_zero_std": 0.125,
"grad_norm": 119.27400207519531,
"learning_rate": 1e-06,
"loss": 0.1874,
"num_tokens": 455127902.0,
"reward": 0.7533248662948608,
"reward_std": 0.24902209639549255,
"rewards/progression_diversity/mean": -0.048372894525527954,
"rewards/progression_diversity/std": 0.11722088605165482,
"rewards/symbolic_reward_accuracy/mean": 0.833984375,
"rewards/symbolic_reward_accuracy/std": 0.3724585771560669,
"rewards/symbolic_reward_partial_score/mean": 0.8759765625,
"rewards/symbolic_reward_partial_score/std": 0.3028711676597595,
"rewards/tag_count_reward/mean": -0.09375,
"rewards/tag_count_reward/std": 0.29176566004753113,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.029304027557373,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 312.0,
"sampling/sampling_logp_difference/mean": 6.3514299392700195,
"step": 1369
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.29380667209625244,
"epoch": 3.6052631578947367,
"grad_norm": 4.995346546173096,
"learning_rate": 1e-06,
"loss": 0.209,
"step": 1370
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.2862599343061447,
"epoch": 3.6078947368421055,
"grad_norm": 794.2086181640625,
"learning_rate": 1e-06,
"loss": 0.2462,
"step": 1371
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.26471564173698425,
"epoch": 3.610526315789474,
"grad_norm": 0.02694852091372013,
"learning_rate": 1e-06,
"loss": 0.2141,
"step": 1372
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.18359375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2655.0,
"completions/mean_length": 3480.74609375,
"completions/mean_terminated_length": 579.057373046875,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"entropy": 0.29130323231220245,
"epoch": 3.613157894736842,
"frac_reward_zero_std": 0.0625,
"grad_norm": 67.4537353515625,
"learning_rate": 1e-06,
"loss": 0.1288,
"num_tokens": 457293500.0,
"reward": 0.7533182501792908,
"reward_std": 0.19987264275550842,
"rewards/progression_diversity/mean": -0.0636826902627945,
"rewards/progression_diversity/std": 0.13695798814296722,
"rewards/symbolic_reward_accuracy/mean": 0.83203125,
"rewards/symbolic_reward_accuracy/std": 0.374204158782959,
"rewards/symbolic_reward_partial_score/mean": 0.8790690302848816,
"rewards/symbolic_reward_partial_score/std": 0.2996877133846283,
"rewards/tag_count_reward/mean": -0.08984375,
"rewards/tag_count_reward/std": 0.2862374484539032,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0184675455093384,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 312.0,
"sampling/sampling_logp_difference/mean": 7.7314229011535645,
"step": 1373
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.24614471197128296,
"epoch": 3.6157894736842104,
"grad_norm": 132.7631378173828,
"learning_rate": 1e-06,
"loss": 0.2735,
"step": 1374
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.23733964562416077,
"epoch": 3.6184210526315788,
"grad_norm": 0.007347916718572378,
"learning_rate": 1e-06,
"loss": 0.2669,
"step": 1375
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4140625,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5546875,
"entropy": 0.262113094329834,
"epoch": 3.6210526315789475,
"grad_norm": 0.00864808913320303,
"learning_rate": 1e-06,
"loss": 0.2765,
"step": 1376
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2650.0,
"completions/mean_length": 2676.58984375,
"completions/mean_terminated_length": 577.2567749023438,
"completions/min_length": 259.0,
"completions/min_terminated_length": 259.0,
"entropy": 0.2994058430194855,
"epoch": 3.623684210526316,
"frac_reward_zero_std": 0.1875,
"grad_norm": 173.56097412109375,
"learning_rate": 1e-06,
"loss": 0.1616,
"num_tokens": 459061546.0,
"reward": 0.8030776977539062,
"reward_std": 0.17727278172969818,
"rewards/progression_diversity/mean": -0.04867924749851227,
"rewards/progression_diversity/std": 0.1260039210319519,
"rewards/symbolic_reward_accuracy/mean": 0.892578125,
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534,
"rewards/symbolic_reward_partial_score/mean": 0.9161783456802368,
"rewards/symbolic_reward_partial_score/std": 0.2622043490409851,
"rewards/tag_count_reward/mean": -0.068359375,
"rewards/tag_count_reward/std": 0.25260838866233826,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0174307823181152,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 312.0,
"sampling/sampling_logp_difference/mean": 7.301526069641113,
"step": 1377
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2790711969137192,
"epoch": 3.626315789473684,
"grad_norm": 6.741792678833008,
"learning_rate": 1e-06,
"loss": 0.3263,
"step": 1378
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.359375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.28607361018657684,
"epoch": 3.6289473684210525,
"grad_norm": 0.06569147855043411,
"learning_rate": 1e-06,
"loss": 0.1445,
"step": 1379
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2860715389251709,
"epoch": 3.6315789473684212,
"grad_norm": 0.07871459424495697,
"learning_rate": 1e-06,
"loss": 0.1596,
"step": 1380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9562.0,
"completions/mean_length": 2981.08984375,
"completions/mean_terminated_length": 644.802734375,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"entropy": 0.28075069189071655,
"epoch": 3.6342105263157896,
"frac_reward_zero_std": 0.1875,
"grad_norm": 278.2060546875,
"learning_rate": 1e-06,
"loss": 0.1905,
"num_tokens": 460993176.0,
"reward": 0.6964207887649536,
"reward_std": 0.240301251411438,
"rewards/progression_diversity/mean": -0.0503075085580349,
"rewards/progression_diversity/std": 0.12393683940172195,
"rewards/symbolic_reward_accuracy/mean": 0.767578125,
"rewards/symbolic_reward_accuracy/std": 0.42278963327407837,
"rewards/symbolic_reward_partial_score/mean": 0.82958984375,
"rewards/symbolic_reward_partial_score/std": 0.34416452050209045,
"rewards/tag_count_reward/mean": -0.125,
"rewards/tag_count_reward/std": 0.3310423493385315,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0247937440872192,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 314.0,
"sampling/sampling_logp_difference/mean": 6.386007308959961,
"step": 1381
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.28298284113407135,
"epoch": 3.636842105263158,
"grad_norm": 0.013535849750041962,
"learning_rate": 1e-06,
"loss": 0.2295,
"step": 1382
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.2968677282333374,
"epoch": 3.639473684210526,
"grad_norm": 0.01233405340462923,
"learning_rate": 1e-06,
"loss": 0.1708,
"step": 1383
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2959662228822708,
"epoch": 3.6421052631578945,
"grad_norm": 0.6275126934051514,
"learning_rate": 1e-06,
"loss": 0.1565,
"step": 1384
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.126953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12655.0,
"completions/mean_length": 2703.05078125,
"completions/mean_terminated_length": 713.6510009765625,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"entropy": 0.3054865449666977,
"epoch": 3.6447368421052633,
"frac_reward_zero_std": 0.03125,
"grad_norm": 223.1032257080078,
"learning_rate": 1e-06,
"loss": 0.1583,
"num_tokens": 462803986.0,
"reward": 0.7047585844993591,
"reward_std": 0.28779661655426025,
"rewards/progression_diversity/mean": -0.046601302921772,
"rewards/progression_diversity/std": 0.12478232383728027,
"rewards/symbolic_reward_accuracy/mean": 0.771484375,
"rewards/symbolic_reward_accuracy/std": 0.4202871024608612,
"rewards/symbolic_reward_partial_score/mean": 0.8448892831802368,
"rewards/symbolic_reward_partial_score/std": 0.32900720834732056,
"rewards/tag_count_reward/mean": -0.111328125,
"rewards/tag_count_reward/std": 0.31484565138816833,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0184571743011475,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 316.0,
"sampling/sampling_logp_difference/mean": 7.270891189575195,
"step": 1385
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.29785533249378204,
"epoch": 3.6473684210526316,
"grad_norm": 0.015050256624817848,
"learning_rate": 1e-06,
"loss": 0.229,
"step": 1386
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3828125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5234375,
"entropy": 0.2764730453491211,
"epoch": 3.65,
"grad_norm": 0.25503814220428467,
"learning_rate": 1e-06,
"loss": 0.2498,
"step": 1387
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.290697917342186,
"epoch": 3.6526315789473687,
"grad_norm": 4.086553573608398,
"learning_rate": 1e-06,
"loss": 0.2406,
"step": 1388
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3594.0,
"completions/mean_length": 2599.453125,
"completions/mean_terminated_length": 630.232177734375,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"entropy": 0.3037455379962921,
"epoch": 3.655263157894737,
"frac_reward_zero_std": 0.0625,
"grad_norm": 311.0882873535156,
"learning_rate": 1e-06,
"loss": 0.1859,
"num_tokens": 464557082.0,
"reward": 0.7203487157821655,
"reward_std": 0.24300315976142883,
"rewards/progression_diversity/mean": -0.04520959407091141,
"rewards/progression_diversity/std": 0.12297887355089188,
"rewards/symbolic_reward_accuracy/mean": 0.7890625,
"rewards/symbolic_reward_accuracy/std": 0.4083731174468994,
"rewards/symbolic_reward_partial_score/mean": 0.8623046875,
"rewards/symbolic_reward_partial_score/std": 0.31785058975219727,
"rewards/tag_count_reward/mean": -0.11328125,
"rewards/tag_count_reward/std": 0.3172462284564972,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0264785289764404,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 318.0,
"sampling/sampling_logp_difference/mean": 6.508614540100098,
"step": 1389
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4453125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5625,
"entropy": 0.30377230048179626,
"epoch": 3.6578947368421053,
"grad_norm": 0.015176265500485897,
"learning_rate": 1e-06,
"loss": 0.1212,
"step": 1390
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.421875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.578125,
"entropy": 0.3056159168481827,
"epoch": 3.6605263157894736,
"grad_norm": 0.1255595088005066,
"learning_rate": 1e-06,
"loss": 0.2117,
"step": 1391
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3828125,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.546875,
"entropy": 0.29334986209869385,
"epoch": 3.663157894736842,
"grad_norm": 0.023698071017861366,
"learning_rate": 1e-06,
"loss": 0.2919,
"step": 1392
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2280.0,
"completions/mean_length": 1780.5703125,
"completions/mean_terminated_length": 609.8311767578125,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"entropy": 0.32029321789741516,
"epoch": 3.6657894736842103,
"frac_reward_zero_std": 0.375,
"grad_norm": 217.3709259033203,
"learning_rate": 1e-06,
"loss": 0.1288,
"num_tokens": 465858846.0,
"reward": 0.7777209281921387,
"reward_std": 0.20324808359146118,
"rewards/progression_diversity/mean": -0.025765735656023026,
"rewards/progression_diversity/std": 0.09416744112968445,
"rewards/symbolic_reward_accuracy/mean": 0.853515625,
"rewards/symbolic_reward_accuracy/std": 0.35393697023391724,
"rewards/symbolic_reward_partial_score/mean": 0.90771484375,
"rewards/symbolic_reward_partial_score/std": 0.2606476843357086,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0482451915740967,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 320.0,
"sampling/sampling_logp_difference/mean": 4.223204612731934,
"step": 1393
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.31482554972171783,
"epoch": 3.668421052631579,
"grad_norm": 2.6809914112091064,
"learning_rate": 1e-06,
"loss": 0.0911,
"step": 1394
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.3331180214881897,
"epoch": 3.6710526315789473,
"grad_norm": 0.01906052976846695,
"learning_rate": 1e-06,
"loss": 0.1226,
"step": 1395
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.32584846019744873,
"epoch": 3.6736842105263157,
"grad_norm": 60.955745697021484,
"learning_rate": 1e-06,
"loss": 0.1089,
"step": 1396
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2274.0,
"completions/mean_length": 2306.021484375,
"completions/mean_terminated_length": 646.1724853515625,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"entropy": 0.3163197785615921,
"epoch": 3.6763157894736844,
"frac_reward_zero_std": 0.28125,
"grad_norm": 166.03375244140625,
"learning_rate": 1e-06,
"loss": 0.1648,
"num_tokens": 467440009.0,
"reward": 0.7467856407165527,
"reward_std": 0.22565452754497528,
"rewards/progression_diversity/mean": -0.038237735629081726,
"rewards/progression_diversity/std": 0.11381914466619492,
"rewards/symbolic_reward_accuracy/mean": 0.8203125,
"rewards/symbolic_reward_accuracy/std": 0.38430243730545044,
"rewards/symbolic_reward_partial_score/mean": 0.8811848759651184,
"rewards/symbolic_reward_partial_score/std": 0.2920258939266205,
"rewards/tag_count_reward/mean": -0.09375,
"rewards/tag_count_reward/std": 0.29176566004753113,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.042597770690918,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 320.0,
"sampling/sampling_logp_difference/mean": 5.3379316329956055,
"step": 1397
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.31748393177986145,
"epoch": 3.6789473684210527,
"grad_norm": 256.6326599121094,
"learning_rate": 1e-06,
"loss": 0.1745,
"step": 1398
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.30522364377975464,
"epoch": 3.681578947368421,
"grad_norm": 2.4348554611206055,
"learning_rate": 1e-06,
"loss": 0.1645,
"step": 1399
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.30944982171058655,
"epoch": 3.6842105263157894,
"grad_norm": 3.376948356628418,
"learning_rate": 1e-06,
"loss": 0.1815,
"step": 1400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2336.0,
"completions/mean_length": 1631.30078125,
"completions/mean_terminated_length": 581.9456176757812,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.3202086091041565,
"epoch": 3.6868421052631577,
"frac_reward_zero_std": 0.40625,
"grad_norm": 520.1287841796875,
"learning_rate": 1e-06,
"loss": 0.1837,
"num_tokens": 468665507.0,
"reward": 0.823440432548523,
"reward_std": 0.1710093915462494,
"rewards/progression_diversity/mean": -0.024127831682562828,
"rewards/progression_diversity/std": 0.09204965829849243,
"rewards/symbolic_reward_accuracy/mean": 0.916015625,
"rewards/symbolic_reward_accuracy/std": 0.2776356339454651,
"rewards/symbolic_reward_partial_score/mean": 0.93310546875,
"rewards/symbolic_reward_partial_score/std": 0.2400180548429489,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0514576435089111,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 322.0,
"sampling/sampling_logp_difference/mean": 4.249440670013428,
"step": 1401
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.33165352046489716,
"epoch": 3.6894736842105265,
"grad_norm": 0.17460408806800842,
"learning_rate": 1e-06,
"loss": 0.1594,
"step": 1402
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.3367105722427368,
"epoch": 3.692105263157895,
"grad_norm": 0.006330076605081558,
"learning_rate": 1e-06,
"loss": 0.1336,
"step": 1403
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.3383978605270386,
"epoch": 3.694736842105263,
"grad_norm": 0.006822290364652872,
"learning_rate": 1e-06,
"loss": 0.0894,
"step": 1404
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.111328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2007.0,
"completions/mean_length": 2382.189453125,
"completions/mean_terminated_length": 628.1165161132812,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"entropy": 0.30853356420993805,
"epoch": 3.6973684210526314,
"frac_reward_zero_std": 0.3125,
"grad_norm": 92.30716705322266,
"learning_rate": 1e-06,
"loss": 0.0998,
"num_tokens": 470292484.0,
"reward": 0.7492548823356628,
"reward_std": 0.22711637616157532,
"rewards/progression_diversity/mean": -0.04033348336815834,
"rewards/progression_diversity/std": 0.11694171279668808,
"rewards/symbolic_reward_accuracy/mean": 0.828125,
"rewards/symbolic_reward_accuracy/std": 0.3776407241821289,
"rewards/symbolic_reward_partial_score/mean": 0.8758138418197632,
"rewards/symbolic_reward_partial_score/std": 0.30300623178482056,
"rewards/tag_count_reward/mean": -0.099609375,
"rewards/tag_count_reward/std": 0.29977133870124817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0440869331359863,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 324.0,
"sampling/sampling_logp_difference/mean": 4.415769100189209,
"step": 1405
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3130802512168884,
"epoch": 3.7,
"grad_norm": 4.948612689971924,
"learning_rate": 1e-06,
"loss": 0.1542,
"step": 1406
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2954626977443695,
"epoch": 3.7026315789473685,
"grad_norm": 214.0167694091797,
"learning_rate": 1e-06,
"loss": 0.2376,
"step": 1407
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.29514195024967194,
"epoch": 3.705263157894737,
"grad_norm": 0.019930332899093628,
"learning_rate": 1e-06,
"loss": 0.1884,
"step": 1408
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2248.0,
"completions/mean_length": 1681.869140625,
"completions/mean_terminated_length": 569.9432983398438,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"entropy": 0.3324284106492996,
"epoch": 3.707894736842105,
"frac_reward_zero_std": 0.21875,
"grad_norm": 933.0283203125,
"learning_rate": 1e-06,
"loss": 0.1402,
"num_tokens": 471541729.0,
"reward": 0.8024474382400513,
"reward_std": 0.21313899755477905,
"rewards/progression_diversity/mean": -0.028701424598693848,
"rewards/progression_diversity/std": 0.10557378083467484,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9270833134651184,
"rewards/symbolic_reward_partial_score/std": 0.2357022613286972,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0388381481170654,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 324.0,
"sampling/sampling_logp_difference/mean": 4.987360954284668,
"step": 1409
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.3176589757204056,
"epoch": 3.7105263157894735,
"grad_norm": 0.014979632571339607,
"learning_rate": 1e-06,
"loss": 0.174,
"step": 1410
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.32269425690174103,
"epoch": 3.713157894736842,
"grad_norm": 0.00700529245659709,
"learning_rate": 1e-06,
"loss": 0.1423,
"step": 1411
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.3213765025138855,
"epoch": 3.7157894736842105,
"grad_norm": 0.2787642180919647,
"learning_rate": 1e-06,
"loss": 0.1043,
"step": 1412
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3836.0,
"completions/mean_length": 2176.283203125,
"completions/mean_terminated_length": 638.6514892578125,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 0.31092438101768494,
"epoch": 3.718421052631579,
"frac_reward_zero_std": 0.1875,
"grad_norm": 331.7125244140625,
"learning_rate": 1e-06,
"loss": 0.171,
"num_tokens": 473063122.0,
"reward": 0.7549986839294434,
"reward_std": 0.18977715075016022,
"rewards/progression_diversity/mean": -0.037239447236061096,
"rewards/progression_diversity/std": 0.11695530265569687,
"rewards/symbolic_reward_accuracy/mean": 0.826171875,
"rewards/symbolic_reward_accuracy/std": 0.3793322443962097,
"rewards/symbolic_reward_partial_score/mean": 0.8948567509651184,
"rewards/symbolic_reward_partial_score/std": 0.2710212469100952,
"rewards/tag_count_reward/mean": -0.087890625,
"rewards/tag_count_reward/std": 0.2834126651287079,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0411852598190308,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 326.0,
"sampling/sampling_logp_difference/mean": 4.672489166259766,
"step": 1413
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.29592007398605347,
"epoch": 3.7210526315789476,
"grad_norm": 225.3509979248047,
"learning_rate": 1e-06,
"loss": 0.211,
"step": 1414
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4140625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.515625,
"entropy": 0.3040802776813507,
"epoch": 3.723684210526316,
"grad_norm": 0.005704566836357117,
"learning_rate": 1e-06,
"loss": 0.1706,
"step": 1415
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.4140625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.484375,
"entropy": 0.33103369176387787,
"epoch": 3.7263157894736842,
"grad_norm": 0.011854211799800396,
"learning_rate": 1e-06,
"loss": 0.0719,
"step": 1416
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.083984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 1917.099609375,
"completions/mean_terminated_length": 590.7100219726562,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.295977458357811,
"epoch": 3.7289473684210526,
"frac_reward_zero_std": 0.21875,
"grad_norm": 196.44320678710938,
"learning_rate": 1e-06,
"loss": 0.2175,
"num_tokens": 474441989.0,
"reward": 0.7707229852676392,
"reward_std": 0.23186513781547546,
"rewards/progression_diversity/mean": -0.03708141669631004,
"rewards/progression_diversity/std": 0.12436394393444061,
"rewards/symbolic_reward_accuracy/mean": 0.84765625,
"rewards/symbolic_reward_accuracy/std": 0.35970520973205566,
"rewards/symbolic_reward_partial_score/mean": 0.8990885019302368,
"rewards/symbolic_reward_partial_score/std": 0.27446648478507996,
"rewards/tag_count_reward/mean": -0.072265625,
"rewards/tag_count_reward/std": 0.2591804563999176,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0268856287002563,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 328.0,
"sampling/sampling_logp_difference/mean": 5.979172229766846,
"step": 1417
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.31002572178840637,
"epoch": 3.731578947368421,
"grad_norm": 1676.92724609375,
"learning_rate": 1e-06,
"loss": 0.4721,
"step": 1418
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.3273525983095169,
"epoch": 3.734210526315789,
"grad_norm": 0.009710345417261124,
"learning_rate": 1e-06,
"loss": 0.0811,
"step": 1419
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.3278929442167282,
"epoch": 3.736842105263158,
"grad_norm": 0.014435559511184692,
"learning_rate": 1e-06,
"loss": 0.145,
"step": 1420
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11328125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2835.0,
"completions/mean_length": 2402.435546875,
"completions/mean_terminated_length": 616.2444458007812,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"entropy": 0.308517649769783,
"epoch": 3.7394736842105263,
"frac_reward_zero_std": 0.1875,
"grad_norm": 69.96543884277344,
"learning_rate": 1e-06,
"loss": 0.1305,
"num_tokens": 476068004.0,
"reward": 0.755204439163208,
"reward_std": 0.24294275045394897,
"rewards/progression_diversity/mean": -0.050845738500356674,
"rewards/progression_diversity/std": 0.14557471871376038,
"rewards/symbolic_reward_accuracy/mean": 0.8359375,
"rewards/symbolic_reward_accuracy/std": 0.37069445848464966,
"rewards/symbolic_reward_partial_score/mean": 0.88037109375,
"rewards/symbolic_reward_partial_score/std": 0.296750545501709,
"rewards/tag_count_reward/mean": -0.099609375,
"rewards/tag_count_reward/std": 0.29977133870124817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0160291194915771,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 328.0,
"sampling/sampling_logp_difference/mean": 7.112195014953613,
"step": 1421
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.29520660638809204,
"epoch": 3.7421052631578946,
"grad_norm": 1305.361083984375,
"learning_rate": 1e-06,
"loss": 0.298,
"step": 1422
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.2944742292165756,
"epoch": 3.7447368421052634,
"grad_norm": 1.5552692413330078,
"learning_rate": 1e-06,
"loss": 0.1569,
"step": 1423
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2920891344547272,
"epoch": 3.7473684210526317,
"grad_norm": 0.0419452041387558,
"learning_rate": 1e-06,
"loss": 0.2029,
"step": 1424
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2503.0,
"completions/mean_length": 2013.22265625,
"completions/mean_terminated_length": 594.6480712890625,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"entropy": 0.30936600267887115,
"epoch": 3.75,
"frac_reward_zero_std": 0.4375,
"grad_norm": 129.8343048095703,
"learning_rate": 1e-06,
"loss": 0.0752,
"num_tokens": 477496566.0,
"reward": 0.7965672016143799,
"reward_std": 0.15301448106765747,
"rewards/progression_diversity/mean": -0.03566431626677513,
"rewards/progression_diversity/std": 0.11889361590147018,
"rewards/symbolic_reward_accuracy/mean": 0.87890625,
"rewards/symbolic_reward_accuracy/std": 0.3265552520751953,
"rewards/symbolic_reward_partial_score/mean": 0.9246419072151184,
"rewards/symbolic_reward_partial_score/std": 0.23886118829250336,
"rewards/tag_count_reward/mean": -0.078125,
"rewards/tag_count_reward/std": 0.26863065361976624,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0362024307250977,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 330.0,
"sampling/sampling_logp_difference/mean": 4.878294467926025,
"step": 1425
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.32300904393196106,
"epoch": 3.7526315789473683,
"grad_norm": 0.02335791103541851,
"learning_rate": 1e-06,
"loss": 0.1367,
"step": 1426
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.31392285227775574,
"epoch": 3.7552631578947366,
"grad_norm": 0.0137960035353899,
"learning_rate": 1e-06,
"loss": 0.1808,
"step": 1427
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3122747987508774,
"epoch": 3.7578947368421054,
"grad_norm": 0.0074070231057703495,
"learning_rate": 1e-06,
"loss": 0.1176,
"step": 1428
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.103515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11201.0,
"completions/mean_length": 2229.384765625,
"completions/mean_terminated_length": 594.9738159179688,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"entropy": 0.3264199197292328,
"epoch": 3.7605263157894737,
"frac_reward_zero_std": 0.3125,
"grad_norm": 201.52215576171875,
"learning_rate": 1e-06,
"loss": 0.0746,
"num_tokens": 479012763.0,
"reward": 0.7792283296585083,
"reward_std": 0.16983628273010254,
"rewards/progression_diversity/mean": -0.041039757430553436,
"rewards/progression_diversity/std": 0.12305484712123871,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.91064453125,
"rewards/symbolic_reward_partial_score/std": 0.25674015283584595,
"rewards/tag_count_reward/mean": -0.091796875,
"rewards/tag_count_reward/std": 0.289021372795105,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0295979976654053,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 332.0,
"sampling/sampling_logp_difference/mean": 6.160717964172363,
"step": 1429
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.30331704020500183,
"epoch": 3.763157894736842,
"grad_norm": 0.012537804432213306,
"learning_rate": 1e-06,
"loss": 0.1338,
"step": 1430
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3093401789665222,
"epoch": 3.765789473684211,
"grad_norm": 0.014041773974895477,
"learning_rate": 1e-06,
"loss": 0.1339,
"step": 1431
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.29702627658843994,
"epoch": 3.768421052631579,
"grad_norm": 0.010763847269117832,
"learning_rate": 1e-06,
"loss": 0.2351,
"step": 1432
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 1706.0,
"completions/mean_length": 2045.2734375,
"completions/mean_terminated_length": 629.8626708984375,
"completions/min_length": 251.0,
"completions/min_terminated_length": 251.0,
"entropy": 0.32291169464588165,
"epoch": 3.7710526315789474,
"frac_reward_zero_std": 0.21875,
"grad_norm": 403.7364807128906,
"learning_rate": 1e-06,
"loss": 0.1425,
"num_tokens": 480464935.0,
"reward": 0.7736941576004028,
"reward_std": 0.2378438413143158,
"rewards/progression_diversity/mean": -0.03292561694979668,
"rewards/progression_diversity/std": 0.10725618153810501,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.9016926884651184,
"rewards/symbolic_reward_partial_score/std": 0.265513151884079,
"rewards/tag_count_reward/mean": -0.07421875,
"rewards/tag_count_reward/std": 0.2623828947544098,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.034686803817749,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 332.0,
"sampling/sampling_logp_difference/mean": 5.4915852546691895,
"step": 1433
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.3211905360221863,
"epoch": 3.7736842105263158,
"grad_norm": 0.015537317842245102,
"learning_rate": 1e-06,
"loss": 0.1144,
"step": 1434
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.33275802433490753,
"epoch": 3.776315789473684,
"grad_norm": 0.014047092758119106,
"learning_rate": 1e-06,
"loss": 0.117,
"step": 1435
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.3044770658016205,
"epoch": 3.7789473684210524,
"grad_norm": 0.010970460250973701,
"learning_rate": 1e-06,
"loss": 0.1902,
"step": 1436
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12968.0,
"completions/mean_length": 3075.37890625,
"completions/mean_terminated_length": 683.5068969726562,
"completions/min_length": 249.0,
"completions/min_terminated_length": 249.0,
"entropy": 0.2942586690187454,
"epoch": 3.781578947368421,
"frac_reward_zero_std": 0.125,
"grad_norm": 349.7894287109375,
"learning_rate": 1e-06,
"loss": 0.1484,
"num_tokens": 482467017.0,
"reward": 0.6715598702430725,
"reward_std": 0.2547609806060791,
"rewards/progression_diversity/mean": -0.05104883760213852,
"rewards/progression_diversity/std": 0.12435862421989441,
"rewards/symbolic_reward_accuracy/mean": 0.732421875,
"rewards/symbolic_reward_accuracy/std": 0.4431293308734894,
"rewards/symbolic_reward_partial_score/mean": 0.8229166269302368,
"rewards/symbolic_reward_partial_score/std": 0.34593749046325684,
"rewards/tag_count_reward/mean": -0.142578125,
"rewards/tag_count_reward/std": 0.3499840497970581,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0357654094696045,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 332.0,
"sampling/sampling_logp_difference/mean": 5.477702617645264,
"step": 1437
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5234375,
"entropy": 0.27393701672554016,
"epoch": 3.7842105263157895,
"grad_norm": 0.00826684758067131,
"learning_rate": 1e-06,
"loss": 0.2608,
"step": 1438
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.390625,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5546875,
"entropy": 0.3037470281124115,
"epoch": 3.786842105263158,
"grad_norm": 0.006643475033342838,
"learning_rate": 1e-06,
"loss": 0.1667,
"step": 1439
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5234375,
"entropy": 0.28216809034347534,
"epoch": 3.7894736842105265,
"grad_norm": 0.01340513676404953,
"learning_rate": 1e-06,
"loss": 0.2354,
"step": 1440
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3061.0,
"completions/mean_length": 1698.798828125,
"completions/mean_terminated_length": 588.1533813476562,
"completions/min_length": 237.0,
"completions/min_terminated_length": 237.0,
"entropy": 0.33806556463241577,
"epoch": 3.792105263157895,
"frac_reward_zero_std": 0.34375,
"grad_norm": 164.4355926513672,
"learning_rate": 1e-06,
"loss": 0.0763,
"num_tokens": 483743938.0,
"reward": 0.780125617980957,
"reward_std": 0.18881092965602875,
"rewards/progression_diversity/mean": -0.02455180510878563,
"rewards/progression_diversity/std": 0.09061013907194138,
"rewards/symbolic_reward_accuracy/mean": 0.849609375,
"rewards/symbolic_reward_accuracy/std": 0.35780346393585205,
"rewards/symbolic_reward_partial_score/mean": 0.9208984375,
"rewards/symbolic_reward_partial_score/std": 0.2368720918893814,
"rewards/tag_count_reward/mean": -0.056640625,
"rewards/tag_count_reward/std": 0.23138070106506348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0537469387054443,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 332.0,
"sampling/sampling_logp_difference/mean": 3.5584030151367188,
"step": 1441
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.31358209252357483,
"epoch": 3.794736842105263,
"grad_norm": 0.010208331979811192,
"learning_rate": 1e-06,
"loss": 0.1884,
"step": 1442
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.3256896734237671,
"epoch": 3.7973684210526315,
"grad_norm": 0.05766785889863968,
"learning_rate": 1e-06,
"loss": 0.0753,
"step": 1443
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.32447347044944763,
"epoch": 3.8,
"grad_norm": 0.014500455930829048,
"learning_rate": 1e-06,
"loss": 0.1182,
"step": 1444
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2773.0,
"completions/mean_length": 2110.767578125,
"completions/mean_terminated_length": 566.0454711914062,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"entropy": 0.30462731420993805,
"epoch": 3.8026315789473686,
"frac_reward_zero_std": 0.21875,
"grad_norm": 307.380615234375,
"learning_rate": 1e-06,
"loss": 0.1169,
"num_tokens": 485208267.0,
"reward": 0.7762401700019836,
"reward_std": 0.18577754497528076,
"rewards/progression_diversity/mean": -0.032237473875284195,
"rewards/progression_diversity/std": 0.10042642056941986,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.9088541269302368,
"rewards/symbolic_reward_partial_score/std": 0.2546987235546112,
"rewards/tag_count_reward/mean": -0.0703125,
"rewards/tag_count_reward/std": 0.25592297315597534,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.053719401359558,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 336.0,
"sampling/sampling_logp_difference/mean": 3.9605154991149902,
"step": 1445
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.3047986924648285,
"epoch": 3.805263157894737,
"grad_norm": 0.018491854891180992,
"learning_rate": 1e-06,
"loss": 0.1968,
"step": 1446
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.3292408138513565,
"epoch": 3.807894736842105,
"grad_norm": 2.424530267715454,
"learning_rate": 1e-06,
"loss": 0.0355,
"step": 1447
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2958214730024338,
"epoch": 3.8105263157894735,
"grad_norm": 0.017129186540842056,
"learning_rate": 1e-06,
"loss": 0.2069,
"step": 1448
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16038.0,
"completions/mean_length": 2222.494140625,
"completions/mean_terminated_length": 621.6282348632812,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.3045518100261688,
"epoch": 3.8131578947368423,
"frac_reward_zero_std": 0.25,
"grad_norm": 98.93772888183594,
"learning_rate": 1e-06,
"loss": 0.1575,
"num_tokens": 486730280.0,
"reward": 0.7752275466918945,
"reward_std": 0.1814824491739273,
"rewards/progression_diversity/mean": -0.03583987057209015,
"rewards/progression_diversity/std": 0.10934660583734512,
"rewards/symbolic_reward_accuracy/mean": 0.857421875,
"rewards/symbolic_reward_accuracy/std": 0.3499840497970581,
"rewards/symbolic_reward_partial_score/mean": 0.8990885615348816,
"rewards/symbolic_reward_partial_score/std": 0.27117884159088135,
"rewards/tag_count_reward/mean": -0.0859375,
"rewards/tag_count_reward/std": 0.28054583072662354,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0510573387145996,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 336.0,
"sampling/sampling_logp_difference/mean": 4.26395320892334,
"step": 1449
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.31291763484477997,
"epoch": 3.8157894736842106,
"grad_norm": 20.84035301208496,
"learning_rate": 1e-06,
"loss": 0.1215,
"step": 1450
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.30178968608379364,
"epoch": 3.818421052631579,
"grad_norm": 0.08378605544567108,
"learning_rate": 1e-06,
"loss": 0.1589,
"step": 1451
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.30265994369983673,
"epoch": 3.8210526315789473,
"grad_norm": 0.11986826360225677,
"learning_rate": 1e-06,
"loss": 0.1607,
"step": 1452
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.091796875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10509.0,
"completions/mean_length": 2041.810546875,
"completions/mean_terminated_length": 592.169921875,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"entropy": 0.3079294115304947,
"epoch": 3.8236842105263156,
"frac_reward_zero_std": 0.21875,
"grad_norm": 155.3240509033203,
"learning_rate": 1e-06,
"loss": 0.1091,
"num_tokens": 488187015.0,
"reward": 0.7332264184951782,
"reward_std": 0.21755467355251312,
"rewards/progression_diversity/mean": -0.03185119852423668,
"rewards/progression_diversity/std": 0.10352633893489838,
"rewards/symbolic_reward_accuracy/mean": 0.791015625,
"rewards/symbolic_reward_accuracy/std": 0.40698084235191345,
"rewards/symbolic_reward_partial_score/mean": 0.8904622197151184,
"rewards/symbolic_reward_partial_score/std": 0.26156580448150635,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0546609163284302,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 336.0,
"sampling/sampling_logp_difference/mean": 4.185214042663574,
"step": 1453
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3079414367675781,
"epoch": 3.8263157894736843,
"grad_norm": 0.01617673970758915,
"learning_rate": 1e-06,
"loss": 0.1361,
"step": 1454
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.30259697139263153,
"epoch": 3.8289473684210527,
"grad_norm": 0.010684851557016373,
"learning_rate": 1e-06,
"loss": 0.1832,
"step": 1455
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.29774773120880127,
"epoch": 3.831578947368421,
"grad_norm": 0.026969242841005325,
"learning_rate": 1e-06,
"loss": 0.1849,
"step": 1456
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15120.0,
"completions/mean_length": 2334.595703125,
"completions/mean_terminated_length": 609.2302856445312,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"entropy": 0.28709380328655243,
"epoch": 3.8342105263157897,
"frac_reward_zero_std": 0.1875,
"grad_norm": 261.2757568359375,
"learning_rate": 1e-06,
"loss": 0.273,
"num_tokens": 489779800.0,
"reward": 0.741887092590332,
"reward_std": 0.2086125761270523,
"rewards/progression_diversity/mean": -0.039813414216041565,
"rewards/progression_diversity/std": 0.11546120047569275,
"rewards/symbolic_reward_accuracy/mean": 0.814453125,
"rewards/symbolic_reward_accuracy/std": 0.38912075757980347,
"rewards/symbolic_reward_partial_score/mean": 0.8785807490348816,
"rewards/symbolic_reward_partial_score/std": 0.2908572554588318,
"rewards/tag_count_reward/mean": -0.099609375,
"rewards/tag_count_reward/std": 0.29977133870124817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0556657314300537,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 340.0,
"sampling/sampling_logp_difference/mean": 5.175328731536865,
"step": 1457
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3067042976617813,
"epoch": 3.836842105263158,
"grad_norm": 0.011344866827130318,
"learning_rate": 1e-06,
"loss": 0.1358,
"step": 1458
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.3160387873649597,
"epoch": 3.8394736842105264,
"grad_norm": 2.8747458457946777,
"learning_rate": 1e-06,
"loss": 0.0778,
"step": 1459
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.46875,
"entropy": 0.2950311750173569,
"epoch": 3.8421052631578947,
"grad_norm": 0.005016846116632223,
"learning_rate": 1e-06,
"loss": 0.1839,
"step": 1460
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15092.0,
"completions/mean_length": 1931.142578125,
"completions/mean_terminated_length": 639.610595703125,
"completions/min_length": 243.0,
"completions/min_terminated_length": 243.0,
"entropy": 0.31487199664115906,
"epoch": 3.844736842105263,
"frac_reward_zero_std": 0.21875,
"grad_norm": 225.66839599609375,
"learning_rate": 1e-06,
"loss": 0.1183,
"num_tokens": 491169345.0,
"reward": 0.7955160140991211,
"reward_std": 0.22690117359161377,
"rewards/progression_diversity/mean": -0.033369071781635284,
"rewards/progression_diversity/std": 0.11018730700016022,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9132486581802368,
"rewards/symbolic_reward_partial_score/std": 0.2617155611515045,
"rewards/tag_count_reward/mean": -0.078125,
"rewards/tag_count_reward/std": 0.26863065361976624,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0568856000900269,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 340.0,
"sampling/sampling_logp_difference/mean": 4.69308614730835,
"step": 1461
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.31012292206287384,
"epoch": 3.8473684210526313,
"grad_norm": 0.009823931381106377,
"learning_rate": 1e-06,
"loss": 0.1637,
"step": 1462
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.2923455536365509,
"epoch": 3.85,
"grad_norm": 0.34986069798469543,
"learning_rate": 1e-06,
"loss": 0.2035,
"step": 1463
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.3000039905309677,
"epoch": 3.8526315789473684,
"grad_norm": 0.01767495833337307,
"learning_rate": 1e-06,
"loss": 0.1856,
"step": 1464
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.076171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9351.0,
"completions/mean_length": 1834.5859375,
"completions/mean_terminated_length": 634.9513549804688,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"entropy": 0.3126460313796997,
"epoch": 3.8552631578947367,
"frac_reward_zero_std": 0.28125,
"grad_norm": 1094.819580078125,
"learning_rate": 1e-06,
"loss": 0.1071,
"num_tokens": 492513805.0,
"reward": 0.763613224029541,
"reward_std": 0.17882443964481354,
"rewards/progression_diversity/mean": -0.03028068132698536,
"rewards/progression_diversity/std": 0.10648433864116669,
"rewards/symbolic_reward_accuracy/mean": 0.828125,
"rewards/symbolic_reward_accuracy/std": 0.3776407241821289,
"rewards/symbolic_reward_partial_score/mean": 0.9070637822151184,
"rewards/symbolic_reward_partial_score/std": 0.24986954033374786,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0567845106124878,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 340.0,
"sampling/sampling_logp_difference/mean": 4.487025260925293,
"step": 1465
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2963457703590393,
"epoch": 3.8578947368421055,
"grad_norm": 551.8005981445312,
"learning_rate": 1e-06,
"loss": 0.2159,
"step": 1466
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.30269500613212585,
"epoch": 3.860526315789474,
"grad_norm": 30.8673152923584,
"learning_rate": 1e-06,
"loss": 0.1849,
"step": 1467
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.31897950172424316,
"epoch": 3.863157894736842,
"grad_norm": 0.007765746209770441,
"learning_rate": 1e-06,
"loss": 0.0966,
"step": 1468
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14826.0,
"completions/mean_length": 2125.513671875,
"completions/mean_terminated_length": 650.4978637695312,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"entropy": 0.2966301888227463,
"epoch": 3.8657894736842104,
"frac_reward_zero_std": 0.15625,
"grad_norm": 295.4943542480469,
"learning_rate": 1e-06,
"loss": 0.2116,
"num_tokens": 493999860.0,
"reward": 0.7747255563735962,
"reward_std": 0.2700809836387634,
"rewards/progression_diversity/mean": -0.04209459200501442,
"rewards/progression_diversity/std": 0.12887020409107208,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.89697265625,
"rewards/symbolic_reward_partial_score/std": 0.2792568802833557,
"rewards/tag_count_reward/mean": -0.095703125,
"rewards/tag_count_reward/std": 0.2944713830947876,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0543975830078125,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 342.0,
"sampling/sampling_logp_difference/mean": 4.8566389083862305,
"step": 1469
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2987577021121979,
"epoch": 3.8684210526315788,
"grad_norm": 2.0050089359283447,
"learning_rate": 1e-06,
"loss": 0.125,
"step": 1470
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.31162601709365845,
"epoch": 3.8710526315789475,
"grad_norm": 0.4049341678619385,
"learning_rate": 1e-06,
"loss": 0.0939,
"step": 1471
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.27791038155555725,
"epoch": 3.873684210526316,
"grad_norm": 2.2584075927734375,
"learning_rate": 1e-06,
"loss": 0.2615,
"step": 1472
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.126953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3494.0,
"completions/mean_length": 2676.2734375,
"completions/mean_terminated_length": 682.9798583984375,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"entropy": 0.2749413251876831,
"epoch": 3.876315789473684,
"frac_reward_zero_std": 0.21875,
"grad_norm": 352.43115234375,
"learning_rate": 1e-06,
"loss": 0.1901,
"num_tokens": 495803296.0,
"reward": 0.6940611600875854,
"reward_std": 0.22690893709659576,
"rewards/progression_diversity/mean": -0.047005537897348404,
"rewards/progression_diversity/std": 0.13115794956684113,
"rewards/symbolic_reward_accuracy/mean": 0.75390625,
"rewards/symbolic_reward_accuracy/std": 0.4311550557613373,
"rewards/symbolic_reward_partial_score/mean": 0.845703125,
"rewards/symbolic_reward_partial_score/std": 0.31764960289001465,
"rewards/tag_count_reward/mean": -0.115234375,
"rewards/tag_count_reward/std": 0.3196168541908264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0539484024047852,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 342.0,
"sampling/sampling_logp_difference/mean": 4.197028636932373,
"step": 1473
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.2936839610338211,
"epoch": 3.8789473684210525,
"grad_norm": 0.012438047677278519,
"learning_rate": 1e-06,
"loss": 0.1267,
"step": 1474
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3515625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.3024512231349945,
"epoch": 3.8815789473684212,
"grad_norm": 0.14518336951732635,
"learning_rate": 1e-06,
"loss": 0.1382,
"step": 1475
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.5,
"entropy": 0.2862910181283951,
"epoch": 3.8842105263157896,
"grad_norm": 0.021601000800728798,
"learning_rate": 1e-06,
"loss": 0.2073,
"step": 1476
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.072265625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5602.0,
"completions/mean_length": 1790.48828125,
"completions/mean_terminated_length": 653.73046875,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"entropy": 0.2987203449010849,
"epoch": 3.886842105263158,
"frac_reward_zero_std": 0.28125,
"grad_norm": 451.8078308105469,
"learning_rate": 1e-06,
"loss": 0.2304,
"num_tokens": 497139354.0,
"reward": 0.7696309089660645,
"reward_std": 0.18139323592185974,
"rewards/progression_diversity/mean": -0.024213135242462158,
"rewards/progression_diversity/std": 0.09224303066730499,
"rewards/symbolic_reward_accuracy/mean": 0.845703125,
"rewards/symbolic_reward_accuracy/std": 0.36158639192581177,
"rewards/symbolic_reward_partial_score/mean": 0.8963216543197632,
"rewards/symbolic_reward_partial_score/std": 0.2699549198150635,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.065273404121399,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 340.0,
"sampling/sampling_logp_difference/mean": 2.8104472160339355,
"step": 1477
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.3077242374420166,
"epoch": 3.889473684210526,
"grad_norm": 75.5800552368164,
"learning_rate": 1e-06,
"loss": 0.1149,
"step": 1478
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.32739219069480896,
"epoch": 3.8921052631578945,
"grad_norm": 0.022421207278966904,
"learning_rate": 1e-06,
"loss": 0.0204,
"step": 1479
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.3064194321632385,
"epoch": 3.8947368421052633,
"grad_norm": 0.005432396661490202,
"learning_rate": 1e-06,
"loss": 0.1348,
"step": 1480
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.091796875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2170.0,
"completions/mean_length": 2075.45703125,
"completions/mean_terminated_length": 629.2172241210938,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"entropy": 0.29528728127479553,
"epoch": 3.8973684210526316,
"frac_reward_zero_std": 0.34375,
"grad_norm": 82.80768585205078,
"learning_rate": 1e-06,
"loss": 0.0844,
"num_tokens": 498619812.0,
"reward": 0.7649828791618347,
"reward_std": 0.20222607254981995,
"rewards/progression_diversity/mean": -0.034921687096357346,
"rewards/progression_diversity/std": 0.11383773386478424,
"rewards/symbolic_reward_accuracy/mean": 0.837890625,
"rewards/symbolic_reward_accuracy/std": 0.3689115643501282,
"rewards/symbolic_reward_partial_score/mean": 0.9026692509651184,
"rewards/symbolic_reward_partial_score/std": 0.26592451333999634,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0633716583251953,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 342.0,
"sampling/sampling_logp_difference/mean": 3.5869860649108887,
"step": 1481
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.3259361535310745,
"epoch": 3.9,
"grad_norm": 0.030284911394119263,
"learning_rate": 1e-06,
"loss": 0.0843,
"step": 1482
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.2944328784942627,
"epoch": 3.9026315789473687,
"grad_norm": 0.011786861345171928,
"learning_rate": 1e-06,
"loss": 0.2573,
"step": 1483
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.30443601310253143,
"epoch": 3.905263157894737,
"grad_norm": 0.02485356479883194,
"learning_rate": 1e-06,
"loss": 0.1844,
"step": 1484
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2390.0,
"completions/mean_length": 1519.84375,
"completions/mean_terminated_length": 594.6888427734375,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"entropy": 0.32855215668678284,
"epoch": 3.9078947368421053,
"frac_reward_zero_std": 0.3125,
"grad_norm": 51.70481491088867,
"learning_rate": 1e-06,
"loss": 0.1191,
"num_tokens": 499803124.0,
"reward": 0.8123822212219238,
"reward_std": 0.18835322558879852,
"rewards/progression_diversity/mean": -0.021549619734287262,
"rewards/progression_diversity/std": 0.08892330527305603,
"rewards/symbolic_reward_accuracy/mean": 0.8984375,
"rewards/symbolic_reward_accuracy/std": 0.30236753821372986,
"rewards/symbolic_reward_partial_score/mean": 0.9287109375,
"rewards/symbolic_reward_partial_score/std": 0.23190389573574066,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0671436786651611,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 342.0,
"sampling/sampling_logp_difference/mean": 3.182962417602539,
"step": 1485
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.30832575261592865,
"epoch": 3.9105263157894736,
"grad_norm": 0.009262947365641594,
"learning_rate": 1e-06,
"loss": 0.1818,
"step": 1486
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3187596797943115,
"epoch": 3.913157894736842,
"grad_norm": 0.00602568918839097,
"learning_rate": 1e-06,
"loss": 0.1425,
"step": 1487
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3167625367641449,
"epoch": 3.9157894736842103,
"grad_norm": 0.01127164252102375,
"learning_rate": 1e-06,
"loss": 0.1257,
"step": 1488
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2527.0,
"completions/mean_length": 1434.26953125,
"completions/mean_terminated_length": 569.4090576171875,
"completions/min_length": 226.0,
"completions/min_terminated_length": 226.0,
"entropy": 0.3199300616979599,
"epoch": 3.918421052631579,
"frac_reward_zero_std": 0.3125,
"grad_norm": 47.4603157043457,
"learning_rate": 1e-06,
"loss": 0.1406,
"num_tokens": 500914430.0,
"reward": 0.8060013055801392,
"reward_std": 0.2019416242837906,
"rewards/progression_diversity/mean": -0.019987143576145172,
"rewards/progression_diversity/std": 0.088285431265831,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9295247793197632,
"rewards/symbolic_reward_partial_score/std": 0.22341255843639374,
"rewards/tag_count_reward/mean": -0.046875,
"rewards/tag_count_reward/std": 0.21157780289649963,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0701146125793457,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 344.0,
"sampling/sampling_logp_difference/mean": 2.487426280975342,
"step": 1489
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.32358092069625854,
"epoch": 3.9210526315789473,
"grad_norm": 5.1159563064575195,
"learning_rate": 1e-06,
"loss": 0.074,
"step": 1490
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.3156203180551529,
"epoch": 3.9236842105263157,
"grad_norm": 19.548402786254883,
"learning_rate": 1e-06,
"loss": 0.0988,
"step": 1491
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.321828693151474,
"epoch": 3.9263157894736844,
"grad_norm": 95.19469451904297,
"learning_rate": 1e-06,
"loss": 0.1048,
"step": 1492
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.056640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9638.0,
"completions/mean_length": 1489.11328125,
"completions/mean_terminated_length": 594.8033447265625,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"entropy": 0.31046292185783386,
"epoch": 3.9289473684210527,
"frac_reward_zero_std": 0.34375,
"grad_norm": 160.60206604003906,
"learning_rate": 1e-06,
"loss": 0.1243,
"num_tokens": 502064824.0,
"reward": 0.8175588250160217,
"reward_std": 0.18224866688251495,
"rewards/progression_diversity/mean": -0.02145996317267418,
"rewards/progression_diversity/std": 0.08906258642673492,
"rewards/symbolic_reward_accuracy/mean": 0.904296875,
"rewards/symbolic_reward_accuracy/std": 0.2944713830947876,
"rewards/symbolic_reward_partial_score/mean": 0.9342448115348816,
"rewards/symbolic_reward_partial_score/std": 0.2271050065755844,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0690562725067139,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 2.968571662902832,
"step": 1493
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.33640483021736145,
"epoch": 3.931578947368421,
"grad_norm": 0.021926378831267357,
"learning_rate": 1e-06,
"loss": 0.0626,
"step": 1494
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.29776348173618317,
"epoch": 3.9342105263157894,
"grad_norm": 0.009869229048490524,
"learning_rate": 1e-06,
"loss": 0.1971,
"step": 1495
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.310922309756279,
"epoch": 3.9368421052631577,
"grad_norm": 0.004013043362647295,
"learning_rate": 1e-06,
"loss": 0.0687,
"step": 1496
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9479.0,
"completions/mean_length": 1596.375,
"completions/mean_terminated_length": 577.6033325195312,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"entropy": 0.32028135657310486,
"epoch": 3.9394736842105265,
"frac_reward_zero_std": 0.375,
"grad_norm": 302.3440856933594,
"learning_rate": 1e-06,
"loss": 0.1131,
"num_tokens": 503281304.0,
"reward": 0.7774491906166077,
"reward_std": 0.17322488129138947,
"rewards/progression_diversity/mean": -0.02363717183470726,
"rewards/progression_diversity/std": 0.09269430488348007,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.91064453125,
"rewards/symbolic_reward_partial_score/std": 0.2503076493740082,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0702153444290161,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 350.0,
"sampling/sampling_logp_difference/mean": 2.403035879135132,
"step": 1497
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.31212155520915985,
"epoch": 3.942105263157895,
"grad_norm": 0.007395219057798386,
"learning_rate": 1e-06,
"loss": 0.108,
"step": 1498
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.30698293447494507,
"epoch": 3.944736842105263,
"grad_norm": 0.010542972013354301,
"learning_rate": 1e-06,
"loss": 0.1506,
"step": 1499
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.31951723992824554,
"epoch": 3.9473684210526314,
"grad_norm": 0.01814758963882923,
"learning_rate": 1e-06,
"loss": 0.0871,
"step": 1500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9357.0,
"completions/mean_length": 1486.361328125,
"completions/mean_terminated_length": 624.5144653320312,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"entropy": 0.32599151134490967,
"epoch": 3.95,
"frac_reward_zero_std": 0.40625,
"grad_norm": 6.691247940063477,
"learning_rate": 1e-06,
"loss": 0.0441,
"num_tokens": 504458641.0,
"reward": 0.7741117477416992,
"reward_std": 0.17628371715545654,
"rewards/progression_diversity/mean": -0.02046554908156395,
"rewards/progression_diversity/std": 0.08719975501298904,
"rewards/symbolic_reward_accuracy/mean": 0.841796875,
"rewards/symbolic_reward_accuracy/std": 0.36528825759887695,
"rewards/symbolic_reward_partial_score/mean": 0.9143880605697632,
"rewards/symbolic_reward_partial_score/std": 0.23354806005954742,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0702029466629028,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.0351459980010986,
"step": 1501
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.32493776082992554,
"epoch": 3.9526315789473685,
"grad_norm": 1.328794240951538,
"learning_rate": 1e-06,
"loss": 0.0413,
"step": 1502
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.3139204680919647,
"epoch": 3.955263157894737,
"grad_norm": 0.007978091016411781,
"learning_rate": 1e-06,
"loss": 0.1326,
"step": 1503
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.3053445369005203,
"epoch": 3.957894736842105,
"grad_norm": 1.1158732175827026,
"learning_rate": 1e-06,
"loss": 0.1221,
"step": 1504
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2461.0,
"completions/mean_length": 1384.337890625,
"completions/mean_terminated_length": 614.334716796875,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"entropy": 0.30869564414024353,
"epoch": 3.9605263157894735,
"frac_reward_zero_std": 0.4375,
"grad_norm": 96.13636016845703,
"learning_rate": 1e-06,
"loss": 0.1313,
"num_tokens": 505584222.0,
"reward": 0.8070886135101318,
"reward_std": 0.16022410988807678,
"rewards/progression_diversity/mean": -0.018680397421121597,
"rewards/progression_diversity/std": 0.0852426066994667,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9318033456802368,
"rewards/symbolic_reward_partial_score/std": 0.22796770930290222,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0674837827682495,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 2.188668727874756,
"step": 1505
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.3231370598077774,
"epoch": 3.963157894736842,
"grad_norm": 2.7390880584716797,
"learning_rate": 1e-06,
"loss": 0.0322,
"step": 1506
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.31320738792419434,
"epoch": 3.9657894736842105,
"grad_norm": 3.7349812984466553,
"learning_rate": 1e-06,
"loss": 0.1126,
"step": 1507
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.30574825406074524,
"epoch": 3.968421052631579,
"grad_norm": 0.009325175546109676,
"learning_rate": 1e-06,
"loss": 0.1404,
"step": 1508
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2576.0,
"completions/mean_length": 1660.708984375,
"completions/mean_terminated_length": 613.4456176757812,
"completions/min_length": 202.0,
"completions/min_terminated_length": 202.0,
"entropy": 0.32026034593582153,
"epoch": 3.9710526315789476,
"frac_reward_zero_std": 0.21875,
"grad_norm": 28.75910186767578,
"learning_rate": 1e-06,
"loss": 0.057,
"num_tokens": 506864681.0,
"reward": 0.7385144233703613,
"reward_std": 0.21706028282642365,
"rewards/progression_diversity/mean": -0.020627107471227646,
"rewards/progression_diversity/std": 0.08610701560974121,
"rewards/symbolic_reward_accuracy/mean": 0.791015625,
"rewards/symbolic_reward_accuracy/std": 0.40698084235191345,
"rewards/symbolic_reward_partial_score/mean": 0.9012044072151184,
"rewards/symbolic_reward_partial_score/std": 0.24362309277057648,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.06502366065979,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 346.0,
"sampling/sampling_logp_difference/mean": 2.6898069381713867,
"step": 1509
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3142033517360687,
"epoch": 3.973684210526316,
"grad_norm": 171.92105102539062,
"learning_rate": 1e-06,
"loss": 0.0856,
"step": 1510
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.28372959792613983,
"epoch": 3.9763157894736842,
"grad_norm": 300.263427734375,
"learning_rate": 1e-06,
"loss": 0.1618,
"step": 1511
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.3046766221523285,
"epoch": 3.9789473684210526,
"grad_norm": 0.013155936263501644,
"learning_rate": 1e-06,
"loss": 0.1099,
"step": 1512
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2665.0,
"completions/mean_length": 1467.025390625,
"completions/mean_terminated_length": 636.5958862304688,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"entropy": 0.3126714378595352,
"epoch": 3.981578947368421,
"frac_reward_zero_std": 0.34375,
"grad_norm": 230.4984588623047,
"learning_rate": 1e-06,
"loss": 0.1214,
"num_tokens": 508025942.0,
"reward": 0.7970232963562012,
"reward_std": 0.1775650829076767,
"rewards/progression_diversity/mean": -0.019354067742824554,
"rewards/progression_diversity/std": 0.08400935679674149,
"rewards/symbolic_reward_accuracy/mean": 0.873046875,
"rewards/symbolic_reward_accuracy/std": 0.33324605226516724,
"rewards/symbolic_reward_partial_score/mean": 0.9275715947151184,
"rewards/symbolic_reward_partial_score/std": 0.22987112402915955,
"rewards/tag_count_reward/mean": -0.048828125,
"rewards/tag_count_reward/std": 0.2157193273305893,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0662436485290527,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 2.509793281555176,
"step": 1513
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.31753115355968475,
"epoch": 3.984210526315789,
"grad_norm": 0.017170730978250504,
"learning_rate": 1e-06,
"loss": 0.1007,
"step": 1514
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.2985118627548218,
"epoch": 3.986842105263158,
"grad_norm": 0.0552213080227375,
"learning_rate": 1e-06,
"loss": 0.127,
"step": 1515
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.31124047935009,
"epoch": 3.9894736842105263,
"grad_norm": 0.018329912796616554,
"learning_rate": 1e-06,
"loss": 0.0722,
"step": 1516
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.083984375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2271.0,
"completions/mean_length": 1958.474609375,
"completions/mean_terminated_length": 635.8784790039062,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.285673588514328,
"epoch": 3.9921052631578946,
"frac_reward_zero_std": 0.28125,
"grad_norm": 261.71087646484375,
"learning_rate": 1e-06,
"loss": 0.2431,
"num_tokens": 509438505.0,
"reward": 0.7900243997573853,
"reward_std": 0.1837175190448761,
"rewards/progression_diversity/mean": -0.03076472133398056,
"rewards/progression_diversity/std": 0.10606426000595093,
"rewards/symbolic_reward_accuracy/mean": 0.875,
"rewards/symbolic_reward_accuracy/std": 0.3310423493385315,
"rewards/symbolic_reward_partial_score/mean": 0.9117838144302368,
"rewards/symbolic_reward_partial_score/std": 0.26197776198387146,
"rewards/tag_count_reward/mean": -0.08203125,
"rewards/tag_count_reward/std": 0.2746807038784027,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0607240200042725,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 3.408790111541748,
"step": 1517
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.3019895553588867,
"epoch": 3.9947368421052634,
"grad_norm": 476.9878845214844,
"learning_rate": 1e-06,
"loss": 0.1528,
"step": 1518
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.29304228723049164,
"epoch": 3.9973684210526317,
"grad_norm": 0.3619004487991333,
"learning_rate": 1e-06,
"loss": 0.1619,
"step": 1519
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.3019654005765915,
"epoch": 4.0,
"grad_norm": 0.011948428116738796,
"learning_rate": 1e-06,
"loss": 0.1182,
"step": 1520
},
{
"epoch": 4.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.034423828125,
"eval_completions/max_length": 15897.96875,
"eval_completions/max_terminated_length": 1909.625,
"eval_completions/mean_length": 1054.68017578125,
"eval_completions/mean_terminated_length": 508.42947006225586,
"eval_completions/min_length": 225.25,
"eval_completions/min_terminated_length": 225.25,
"eval_entropy": 0.3078602785244584,
"eval_frac_reward_zero_std": 0.30078125,
"eval_loss": 0.03949446976184845,
"eval_num_tokens": 509438505.0,
"eval_reward": 0.8282011039555073,
"eval_reward_std": 0.17971097235567868,
"eval_rewards/progression_diversity/mean": -0.013755144311289769,
"eval_rewards/progression_diversity/std": 0.07045977615052834,
"eval_rewards/symbolic_reward_accuracy/mean": 0.91259765625,
"eval_rewards/symbolic_reward_accuracy/std": 0.2780101113021374,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9448852557688951,
"eval_rewards/symbolic_reward_partial_score/std": 0.1947934958152473,
"eval_rewards/tag_count_reward/mean": -0.02685546875,
"eval_rewards/tag_count_reward/std": 0.15172951598651707,
"eval_runtime": 3921.274,
"eval_samples_per_second": 0.064,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.0713170282542706,
"eval_sampling/importance_sampling_ratio/min": 2.523935052067827e-07,
"eval_sampling/sampling_logp_difference/max": 336.4912216961384,
"eval_sampling/sampling_logp_difference/mean": 1.0463141361251473,
"eval_steps_per_second": 0.001,
"step": 1520
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2840.0,
"completions/mean_length": 2191.412109375,
"completions/mean_terminated_length": 621.2993774414062,
"completions/min_length": 256.0,
"completions/min_terminated_length": 256.0,
"entropy": 0.27799971401691437,
"epoch": 4.002631578947368,
"frac_reward_zero_std": 0.28125,
"grad_norm": 205.81446838378906,
"learning_rate": 1e-06,
"loss": 0.1941,
"num_tokens": 510976988.0,
"reward": 0.755181074142456,
"reward_std": 0.1851288080215454,
"rewards/progression_diversity/mean": -0.038541071116924286,
"rewards/progression_diversity/std": 0.1202431246638298,
"rewards/symbolic_reward_accuracy/mean": 0.833984375,
"rewards/symbolic_reward_accuracy/std": 0.3724585771560669,
"rewards/symbolic_reward_partial_score/mean": 0.8818359375,
"rewards/symbolic_reward_partial_score/std": 0.2894873321056366,
"rewards/tag_count_reward/mean": -0.09375,
"rewards/tag_count_reward/std": 0.29176566004753113,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.055989146232605,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 4.068850994110107,
"step": 1521
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.30569666624069214,
"epoch": 4.005263157894737,
"grad_norm": 0.4432198405265808,
"learning_rate": 1e-06,
"loss": 0.0885,
"step": 1522
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.28277716040611267,
"epoch": 4.007894736842105,
"grad_norm": 0.008685186505317688,
"learning_rate": 1e-06,
"loss": 0.1703,
"step": 1523
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.2912045568227768,
"epoch": 4.010526315789473,
"grad_norm": 0.018372351303696632,
"learning_rate": 1e-06,
"loss": 0.1348,
"step": 1524
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2278.0,
"completions/mean_length": 1152.287109375,
"completions/mean_terminated_length": 565.263671875,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"entropy": 0.32067057490348816,
"epoch": 4.0131578947368425,
"frac_reward_zero_std": 0.5,
"grad_norm": 151.9108428955078,
"learning_rate": 1e-06,
"loss": 0.1036,
"num_tokens": 511949071.0,
"reward": 0.8358957767486572,
"reward_std": 0.15225106477737427,
"rewards/progression_diversity/mean": -0.013942432589828968,
"rewards/progression_diversity/std": 0.07440164685249329,
"rewards/symbolic_reward_accuracy/mean": 0.923828125,
"rewards/symbolic_reward_accuracy/std": 0.26553234457969666,
"rewards/symbolic_reward_partial_score/mean": 0.9501953125,
"rewards/symbolic_reward_partial_score/std": 0.1953859031200409,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0619966983795166,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 2.296659469604492,
"step": 1525
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.3325800895690918,
"epoch": 4.015789473684211,
"grad_norm": 0.02713238075375557,
"learning_rate": 1e-06,
"loss": 0.0424,
"step": 1526
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.3067079782485962,
"epoch": 4.018421052631579,
"grad_norm": 0.009789610281586647,
"learning_rate": 1e-06,
"loss": 0.1041,
"step": 1527
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.3153266906738281,
"epoch": 4.021052631578947,
"grad_norm": 0.003507691202685237,
"learning_rate": 1e-06,
"loss": 0.0967,
"step": 1528
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2340.0,
"completions/mean_length": 1278.15234375,
"completions/mean_terminated_length": 599.9306030273438,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"entropy": 0.3092862367630005,
"epoch": 4.023684210526316,
"frac_reward_zero_std": 0.375,
"grad_norm": 305.3227233886719,
"learning_rate": 1e-06,
"loss": 0.0672,
"num_tokens": 513002301.0,
"reward": 0.7656987905502319,
"reward_std": 0.17219382524490356,
"rewards/progression_diversity/mean": -0.017035705968737602,
"rewards/progression_diversity/std": 0.08615429699420929,
"rewards/symbolic_reward_accuracy/mean": 0.81640625,
"rewards/symbolic_reward_accuracy/std": 0.3875311613082886,
"rewards/symbolic_reward_partial_score/mean": 0.93115234375,
"rewards/symbolic_reward_partial_score/std": 0.20142759382724762,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0569286346435547,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 2.1248581409454346,
"step": 1529
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3113991618156433,
"epoch": 4.026315789473684,
"grad_norm": 58.646968841552734,
"learning_rate": 1e-06,
"loss": 0.1007,
"step": 1530
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.3031221777200699,
"epoch": 4.028947368421052,
"grad_norm": 0.011349079199135303,
"learning_rate": 1e-06,
"loss": 0.1034,
"step": 1531
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.32167190313339233,
"epoch": 4.031578947368421,
"grad_norm": 0.010632401332259178,
"learning_rate": 1e-06,
"loss": 0.0584,
"step": 1532
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2379.0,
"completions/mean_length": 1502.974609375,
"completions/mean_terminated_length": 576.7697143554688,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"entropy": 0.3021652102470398,
"epoch": 4.03421052631579,
"frac_reward_zero_std": 0.46875,
"grad_norm": 165.5340576171875,
"learning_rate": 1e-06,
"loss": 0.1347,
"num_tokens": 514159952.0,
"reward": 0.8022516965866089,
"reward_std": 0.11929187178611755,
"rewards/progression_diversity/mean": -0.023855865001678467,
"rewards/progression_diversity/std": 0.0998440608382225,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9249674081802368,
"rewards/symbolic_reward_partial_score/std": 0.23110011219978333,
"rewards/tag_count_reward/mean": -0.046875,
"rewards/tag_count_reward/std": 0.21157780289649963,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0544638633728027,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 2.4234862327575684,
"step": 1533
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.3111570328474045,
"epoch": 4.036842105263158,
"grad_norm": 0.01784106157720089,
"learning_rate": 1e-06,
"loss": 0.1066,
"step": 1534
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.3170369863510132,
"epoch": 4.0394736842105265,
"grad_norm": 3.104100227355957,
"learning_rate": 1e-06,
"loss": 0.0767,
"step": 1535
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.3106560707092285,
"epoch": 4.042105263157895,
"grad_norm": 0.010589975863695145,
"learning_rate": 1e-06,
"loss": 0.1156,
"step": 1536
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2955.0,
"completions/mean_length": 909.375,
"completions/mean_terminated_length": 537.9840087890625,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.33030684292316437,
"epoch": 4.044736842105263,
"frac_reward_zero_std": 0.4375,
"grad_norm": 51.92629623413086,
"learning_rate": 1e-06,
"loss": 0.0195,
"num_tokens": 515009328.0,
"reward": 0.8129271268844604,
"reward_std": 0.14900250732898712,
"rewards/progression_diversity/mean": -0.011006257496774197,
"rewards/progression_diversity/std": 0.07234536856412888,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9510090947151184,
"rewards/symbolic_reward_partial_score/std": 0.1690969616174698,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0632414817810059,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 1.5446823835372925,
"step": 1537
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.32397404313087463,
"epoch": 4.0473684210526315,
"grad_norm": 0.022417036816477776,
"learning_rate": 1e-06,
"loss": 0.0486,
"step": 1538
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.3341270387172699,
"epoch": 4.05,
"grad_norm": 0.015315905213356018,
"learning_rate": 1e-06,
"loss": 0.0331,
"step": 1539
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.31841354072093964,
"epoch": 4.052631578947368,
"grad_norm": 61.291114807128906,
"learning_rate": 1e-06,
"loss": 0.0999,
"step": 1540
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2456.0,
"completions/mean_length": 1415.52734375,
"completions/mean_terminated_length": 582.23095703125,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"entropy": 0.31011515855789185,
"epoch": 4.0552631578947365,
"frac_reward_zero_std": 0.40625,
"grad_norm": 137.6245574951172,
"learning_rate": 1e-06,
"loss": 0.0506,
"num_tokens": 516117534.0,
"reward": 0.7666909694671631,
"reward_std": 0.1560915857553482,
"rewards/progression_diversity/mean": -0.02036316879093647,
"rewards/progression_diversity/std": 0.09054167568683624,
"rewards/symbolic_reward_accuracy/mean": 0.83203125,
"rewards/symbolic_reward_accuracy/std": 0.374204158782959,
"rewards/symbolic_reward_partial_score/mean": 0.9065755009651184,
"rewards/symbolic_reward_partial_score/std": 0.24309590458869934,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.058464765548706,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.0097904205322266,
"step": 1541
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.32095426321029663,
"epoch": 4.057894736842106,
"grad_norm": 0.008144257590174675,
"learning_rate": 1e-06,
"loss": 0.0401,
"step": 1542
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.30426955223083496,
"epoch": 4.060526315789474,
"grad_norm": 0.016386935487389565,
"learning_rate": 1e-06,
"loss": 0.119,
"step": 1543
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.3272694945335388,
"epoch": 4.063157894736842,
"grad_norm": 39.390586853027344,
"learning_rate": 1e-06,
"loss": 0.0768,
"step": 1544
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.068359375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3431.0,
"completions/mean_length": 1677.0234375,
"completions/mean_terminated_length": 597.8951416015625,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.3231520652770996,
"epoch": 4.065789473684211,
"frac_reward_zero_std": 0.375,
"grad_norm": 80.80924987792969,
"learning_rate": 1e-06,
"loss": 0.0269,
"num_tokens": 517367626.0,
"reward": 0.7787384390830994,
"reward_std": 0.16898325085639954,
"rewards/progression_diversity/mean": -0.02655177377164364,
"rewards/progression_diversity/std": 0.1030859723687172,
"rewards/symbolic_reward_accuracy/mean": 0.849609375,
"rewards/symbolic_reward_accuracy/std": 0.35780346393585205,
"rewards/symbolic_reward_partial_score/mean": 0.9169921875,
"rewards/symbolic_reward_partial_score/std": 0.24586138129234314,
"rewards/tag_count_reward/mean": -0.05859375,
"rewards/tag_count_reward/std": 0.23509246110916138,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0506144762039185,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 4.009866714477539,
"step": 1545
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.3029029667377472,
"epoch": 4.068421052631579,
"grad_norm": 0.010388639755547047,
"learning_rate": 1e-06,
"loss": 0.1362,
"step": 1546
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.3171136975288391,
"epoch": 4.071052631578947,
"grad_norm": 0.00802148599177599,
"learning_rate": 1e-06,
"loss": 0.06,
"step": 1547
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.28475718200206757,
"epoch": 4.073684210526316,
"grad_norm": 0.005422931630164385,
"learning_rate": 1e-06,
"loss": 0.291,
"step": 1548
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 1218.509765625,
"completions/mean_terminated_length": 665.9210815429688,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"entropy": 0.32124534249305725,
"epoch": 4.076315789473684,
"frac_reward_zero_std": 0.3125,
"grad_norm": 359.06634521484375,
"learning_rate": 1e-06,
"loss": 0.0374,
"num_tokens": 518415663.0,
"reward": 0.8058355450630188,
"reward_std": 0.17662674188613892,
"rewards/progression_diversity/mean": -0.012153420597314835,
"rewards/progression_diversity/std": 0.06827174872159958,
"rewards/symbolic_reward_accuracy/mean": 0.87890625,
"rewards/symbolic_reward_accuracy/std": 0.3265552520751953,
"rewards/symbolic_reward_partial_score/mean": 0.9397786259651184,
"rewards/symbolic_reward_partial_score/std": 0.19970273971557617,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0600073337554932,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.2380099296569824,
"step": 1549
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.29867467284202576,
"epoch": 4.078947368421052,
"grad_norm": 0.015004157088696957,
"learning_rate": 1e-06,
"loss": 0.1029,
"step": 1550
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.3169206827878952,
"epoch": 4.081578947368421,
"grad_norm": 0.020958561450242996,
"learning_rate": 1e-06,
"loss": 0.0588,
"step": 1551
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.3124808520078659,
"epoch": 4.08421052631579,
"grad_norm": 0.011239241808652878,
"learning_rate": 1e-06,
"loss": 0.0997,
"step": 1552
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3201.0,
"completions/mean_length": 1185.564453125,
"completions/mean_terminated_length": 599.823486328125,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.315788209438324,
"epoch": 4.086842105263158,
"frac_reward_zero_std": 0.4375,
"grad_norm": 902.8833618164062,
"learning_rate": 1e-06,
"loss": 0.1105,
"num_tokens": 519423472.0,
"reward": 0.816677451133728,
"reward_std": 0.13706813752651215,
"rewards/progression_diversity/mean": -0.011946848593652248,
"rewards/progression_diversity/std": 0.0664309710264206,
"rewards/symbolic_reward_accuracy/mean": 0.89453125,
"rewards/symbolic_reward_accuracy/std": 0.3074568510055542,
"rewards/symbolic_reward_partial_score/mean": 0.9440103769302368,
"rewards/symbolic_reward_partial_score/std": 0.19122956693172455,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0609365701675415,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.0578854084014893,
"step": 1553
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.3234410434961319,
"epoch": 4.089473684210526,
"grad_norm": 0.014460497535765171,
"learning_rate": 1e-06,
"loss": 0.0497,
"step": 1554
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.31574487686157227,
"epoch": 4.092105263157895,
"grad_norm": 0.00562589755281806,
"learning_rate": 1e-06,
"loss": 0.0687,
"step": 1555
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.30954110622406006,
"epoch": 4.094736842105263,
"grad_norm": 0.011875905096530914,
"learning_rate": 1e-06,
"loss": 0.0327,
"step": 1556
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.041015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3258.0,
"completions/mean_length": 1236.6171875,
"completions/mean_terminated_length": 588.7658081054688,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"entropy": 0.317048043012619,
"epoch": 4.097368421052631,
"frac_reward_zero_std": 0.46875,
"grad_norm": 72.20258331298828,
"learning_rate": 1e-06,
"loss": 0.0398,
"num_tokens": 520449900.0,
"reward": 0.8271297216415405,
"reward_std": 0.1219562292098999,
"rewards/progression_diversity/mean": -0.016525140032172203,
"rewards/progression_diversity/std": 0.0804808959364891,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9503580331802368,
"rewards/symbolic_reward_partial_score/std": 0.1865689754486084,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0606681108474731,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.4164552688598633,
"step": 1557
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.3193093240261078,
"epoch": 4.1,
"grad_norm": 0.008888328447937965,
"learning_rate": 1e-06,
"loss": 0.0658,
"step": 1558
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.3196675330400467,
"epoch": 4.102631578947369,
"grad_norm": 0.007627990562468767,
"learning_rate": 1e-06,
"loss": 0.0829,
"step": 1559
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.31077609956264496,
"epoch": 4.105263157894737,
"grad_norm": 0.011663636192679405,
"learning_rate": 1e-06,
"loss": 0.1007,
"step": 1560
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8140.0,
"completions/mean_length": 1069.8125,
"completions/mean_terminated_length": 607.6136474609375,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"entropy": 0.34023210406303406,
"epoch": 4.1078947368421055,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.020510073751211166,
"learning_rate": 1e-06,
"loss": 0.0128,
"num_tokens": 521379756.0,
"reward": 0.8407922387123108,
"reward_std": 0.14206728339195251,
"rewards/progression_diversity/mean": -0.012575984001159668,
"rewards/progression_diversity/std": 0.07108542323112488,
"rewards/symbolic_reward_accuracy/mean": 0.92578125,
"rewards/symbolic_reward_accuracy/std": 0.2623828947544098,
"rewards/symbolic_reward_partial_score/mean": 0.9606119394302368,
"rewards/symbolic_reward_partial_score/std": 0.1664878875017166,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0578272342681885,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 350.0,
"sampling/sampling_logp_difference/mean": 2.6809022426605225,
"step": 1561
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.31028036773204803,
"epoch": 4.110526315789474,
"grad_norm": 0.0143095962703228,
"learning_rate": 1e-06,
"loss": 0.1249,
"step": 1562
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.30853691697120667,
"epoch": 4.113157894736842,
"grad_norm": 0.05083174258470535,
"learning_rate": 1e-06,
"loss": 0.1291,
"step": 1563
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.32772916555404663,
"epoch": 4.11578947368421,
"grad_norm": 0.020028606057167053,
"learning_rate": 1e-06,
"loss": 0.0466,
"step": 1564
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15382.0,
"completions/mean_length": 1305.990234375,
"completions/mean_terminated_length": 693.06298828125,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"entropy": 0.30608853697776794,
"epoch": 4.118421052631579,
"frac_reward_zero_std": 0.4375,
"grad_norm": 67.87190246582031,
"learning_rate": 1e-06,
"loss": 0.0841,
"num_tokens": 522466247.0,
"reward": 0.8353767395019531,
"reward_std": 0.13427817821502686,
"rewards/progression_diversity/mean": -0.017013559117913246,
"rewards/progression_diversity/std": 0.08219697326421738,
"rewards/symbolic_reward_accuracy/mean": 0.921875,
"rewards/symbolic_reward_accuracy/std": 0.26863065361976624,
"rewards/symbolic_reward_partial_score/mean": 0.9505208134651184,
"rewards/symbolic_reward_partial_score/std": 0.19484204053878784,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0560702085494995,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.4243669509887695,
"step": 1565
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.29687653481960297,
"epoch": 4.121052631578947,
"grad_norm": 0.006236726883798838,
"learning_rate": 1e-06,
"loss": 0.1279,
"step": 1566
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.3095232844352722,
"epoch": 4.123684210526315,
"grad_norm": 0.002965538529679179,
"learning_rate": 1e-06,
"loss": 0.0331,
"step": 1567
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.313509002327919,
"epoch": 4.126315789473685,
"grad_norm": 0.018090499565005302,
"learning_rate": 1e-06,
"loss": 0.0931,
"step": 1568
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3464.0,
"completions/mean_length": 1349.125,
"completions/mean_terminated_length": 577.314208984375,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"entropy": 0.2996673285961151,
"epoch": 4.128947368421053,
"frac_reward_zero_std": 0.46875,
"grad_norm": 218.44454956054688,
"learning_rate": 1e-06,
"loss": 0.0891,
"num_tokens": 523526119.0,
"reward": 0.8193434476852417,
"reward_std": 0.13413912057876587,
"rewards/progression_diversity/mean": -0.018784930929541588,
"rewards/progression_diversity/std": 0.0878358706831932,
"rewards/symbolic_reward_accuracy/mean": 0.904296875,
"rewards/symbolic_reward_accuracy/std": 0.2944713830947876,
"rewards/symbolic_reward_partial_score/mean": 0.9388021230697632,
"rewards/symbolic_reward_partial_score/std": 0.21514074504375458,
"rewards/tag_count_reward/mean": -0.046875,
"rewards/tag_count_reward/std": 0.21157780289649963,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.054738998413086,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.5745482444763184,
"step": 1569
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.31027545034885406,
"epoch": 4.131578947368421,
"grad_norm": 0.004378543235361576,
"learning_rate": 1e-06,
"loss": 0.0894,
"step": 1570
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.31161196529865265,
"epoch": 4.13421052631579,
"grad_norm": 0.005518985912203789,
"learning_rate": 1e-06,
"loss": 0.1057,
"step": 1571
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.322713166475296,
"epoch": 4.136842105263158,
"grad_norm": 0.005724793300032616,
"learning_rate": 1e-06,
"loss": 0.023,
"step": 1572
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3315.0,
"completions/mean_length": 1232.330078125,
"completions/mean_terminated_length": 616.4085083007812,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 0.31801025569438934,
"epoch": 4.139473684210526,
"frac_reward_zero_std": 0.5,
"grad_norm": 114.2596664428711,
"learning_rate": 1e-06,
"loss": 0.0698,
"num_tokens": 524540528.0,
"reward": 0.8093826770782471,
"reward_std": 0.1396641731262207,
"rewards/progression_diversity/mean": -0.013879730366170406,
"rewards/progression_diversity/std": 0.07435765862464905,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9444987177848816,
"rewards/symbolic_reward_partial_score/std": 0.1908387839794159,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.054489254951477,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.705132484436035,
"step": 1573
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.31481173634529114,
"epoch": 4.1421052631578945,
"grad_norm": 0.011268100701272488,
"learning_rate": 1e-06,
"loss": 0.0299,
"step": 1574
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.30692553520202637,
"epoch": 4.144736842105263,
"grad_norm": 0.013008923269808292,
"learning_rate": 1e-06,
"loss": 0.1172,
"step": 1575
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.30435749888420105,
"epoch": 4.147368421052631,
"grad_norm": 0.016314802691340446,
"learning_rate": 1e-06,
"loss": 0.1019,
"step": 1576
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2267.0,
"completions/mean_length": 1044.396484375,
"completions/mean_terminated_length": 644.7675170898438,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.329788476228714,
"epoch": 4.15,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.022590087726712227,
"learning_rate": 1e-06,
"loss": 0.0132,
"num_tokens": 525490075.0,
"reward": 0.8446779847145081,
"reward_std": 0.13736554980278015,
"rewards/progression_diversity/mean": -0.009744609706103802,
"rewards/progression_diversity/std": 0.06334702670574188,
"rewards/symbolic_reward_accuracy/mean": 0.931640625,
"rewards/symbolic_reward_accuracy/std": 0.25260838866233826,
"rewards/symbolic_reward_partial_score/mean": 0.9611002206802368,
"rewards/symbolic_reward_partial_score/std": 0.16516920924186707,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0595576763153076,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 1.8629345893859863,
"step": 1577
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.310794472694397,
"epoch": 4.152631578947369,
"grad_norm": 0.0059421774931252,
"learning_rate": 1e-06,
"loss": 0.0536,
"step": 1578
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.30790117383003235,
"epoch": 4.155263157894737,
"grad_norm": 1.5674898624420166,
"learning_rate": 1e-06,
"loss": 0.0615,
"step": 1579
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.31684084236621857,
"epoch": 4.157894736842105,
"grad_norm": 0.006289552431553602,
"learning_rate": 1e-06,
"loss": 0.049,
"step": 1580
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2869.0,
"completions/mean_length": 1460.595703125,
"completions/mean_terminated_length": 726.6577758789062,
"completions/min_length": 254.0,
"completions/min_terminated_length": 254.0,
"entropy": 0.2980321794748306,
"epoch": 4.160526315789474,
"frac_reward_zero_std": 0.375,
"grad_norm": 215.46676635742188,
"learning_rate": 1e-06,
"loss": 0.0781,
"num_tokens": 526672588.0,
"reward": 0.8091208934783936,
"reward_std": 0.16659247875213623,
"rewards/progression_diversity/mean": -0.015641260892152786,
"rewards/progression_diversity/std": 0.07609428465366364,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9378255009651184,
"rewards/symbolic_reward_partial_score/std": 0.20248618721961975,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.047804594039917,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 3.248504400253296,
"step": 1581
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.28611473739147186,
"epoch": 4.163157894736842,
"grad_norm": 5.547037124633789,
"learning_rate": 1e-06,
"loss": 0.0783,
"step": 1582
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.3098125159740448,
"epoch": 4.16578947368421,
"grad_norm": 0.0198947936296463,
"learning_rate": 1e-06,
"loss": 0.0761,
"step": 1583
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.3102129250764847,
"epoch": 4.168421052631579,
"grad_norm": 0.020427729934453964,
"learning_rate": 1e-06,
"loss": 0.0666,
"step": 1584
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2539.0,
"completions/mean_length": 1271.625,
"completions/mean_terminated_length": 689.2008056640625,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"entropy": 0.3241703659296036,
"epoch": 4.171052631578948,
"frac_reward_zero_std": 0.5,
"grad_norm": 19.84275245666504,
"learning_rate": 1e-06,
"loss": 0.0022,
"num_tokens": 527726124.0,
"reward": 0.8260881900787354,
"reward_std": 0.1229025349020958,
"rewards/progression_diversity/mean": -0.013252872042357922,
"rewards/progression_diversity/std": 0.07066137343645096,
"rewards/symbolic_reward_accuracy/mean": 0.908203125,
"rewards/symbolic_reward_accuracy/std": 0.289021372795105,
"rewards/symbolic_reward_partial_score/mean": 0.94482421875,
"rewards/symbolic_reward_partial_score/std": 0.19333821535110474,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0540461540222168,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 2.846632480621338,
"step": 1585
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.30269454419612885,
"epoch": 4.173684210526316,
"grad_norm": 1.6151353120803833,
"learning_rate": 1e-06,
"loss": 0.0914,
"step": 1586
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.32799434661865234,
"epoch": 4.176315789473684,
"grad_norm": 0.03124081902205944,
"learning_rate": 1e-06,
"loss": 0.007,
"step": 1587
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.3005277067422867,
"epoch": 4.178947368421053,
"grad_norm": 0.007625295780599117,
"learning_rate": 1e-06,
"loss": 0.1777,
"step": 1588
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2949.0,
"completions/mean_length": 1715.181640625,
"completions/mean_terminated_length": 671.7928466796875,
"completions/min_length": 242.0,
"completions/min_terminated_length": 242.0,
"entropy": 0.30243825912475586,
"epoch": 4.181578947368421,
"frac_reward_zero_std": 0.28125,
"grad_norm": 418.0889587402344,
"learning_rate": 1e-06,
"loss": 0.1181,
"num_tokens": 528990761.0,
"reward": 0.7599914073944092,
"reward_std": 0.15093812346458435,
"rewards/progression_diversity/mean": -0.021372389048337936,
"rewards/progression_diversity/std": 0.0902642160654068,
"rewards/symbolic_reward_accuracy/mean": 0.826171875,
"rewards/symbolic_reward_accuracy/std": 0.3793322443962097,
"rewards/symbolic_reward_partial_score/mean": 0.8992512822151184,
"rewards/symbolic_reward_partial_score/std": 0.2561655044555664,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0439589023590088,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 348.0,
"sampling/sampling_logp_difference/mean": 3.9997878074645996,
"step": 1589
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.29131756722927094,
"epoch": 4.184210526315789,
"grad_norm": 0.009967650286853313,
"learning_rate": 1e-06,
"loss": 0.1418,
"step": 1590
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.288121297955513,
"epoch": 4.186842105263158,
"grad_norm": 0.011154073290526867,
"learning_rate": 1e-06,
"loss": 0.1733,
"step": 1591
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.32074061036109924,
"epoch": 4.189473684210526,
"grad_norm": 0.015442883595824242,
"learning_rate": 1e-06,
"loss": 0.057,
"step": 1592
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3200.0,
"completions/mean_length": 1718.6875,
"completions/mean_terminated_length": 741.0000610351562,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.3073541969060898,
"epoch": 4.192105263157894,
"frac_reward_zero_std": 0.5625,
"grad_norm": 62.47317123413086,
"learning_rate": 1e-06,
"loss": 0.0488,
"num_tokens": 530279049.0,
"reward": 0.776131272315979,
"reward_std": 0.11894486844539642,
"rewards/progression_diversity/mean": -0.01871039718389511,
"rewards/progression_diversity/std": 0.07976720482110977,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.9021810293197632,
"rewards/symbolic_reward_partial_score/std": 0.25698360800743103,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0540015697479248,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 346.0,
"sampling/sampling_logp_difference/mean": 2.4201743602752686,
"step": 1593
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.29900647699832916,
"epoch": 4.1947368421052635,
"grad_norm": 103.9713363647461,
"learning_rate": 1e-06,
"loss": 0.0922,
"step": 1594
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.31466494500637054,
"epoch": 4.197368421052632,
"grad_norm": 0.0200185626745224,
"learning_rate": 1e-06,
"loss": 0.0658,
"step": 1595
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.30266304314136505,
"epoch": 4.2,
"grad_norm": 0.011375721544027328,
"learning_rate": 1e-06,
"loss": 0.0875,
"step": 1596
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 1656.662109375,
"completions/mean_terminated_length": 740.0228881835938,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"entropy": 0.3035944253206253,
"epoch": 4.2026315789473685,
"frac_reward_zero_std": 0.3125,
"grad_norm": 208.2892608642578,
"learning_rate": 1e-06,
"loss": 0.0753,
"num_tokens": 531551420.0,
"reward": 0.7731266021728516,
"reward_std": 0.16756606101989746,
"rewards/progression_diversity/mean": -0.021324899047613144,
"rewards/progression_diversity/std": 0.08820579946041107,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.9150390625,
"rewards/symbolic_reward_partial_score/std": 0.22819702327251434,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0414807796478271,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 350.0,
"sampling/sampling_logp_difference/mean": 3.9151625633239746,
"step": 1597
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.3109179735183716,
"epoch": 4.205263157894737,
"grad_norm": 0.0169313196092844,
"learning_rate": 1e-06,
"loss": 0.1055,
"step": 1598
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.31678929924964905,
"epoch": 4.207894736842105,
"grad_norm": 0.07289431989192963,
"learning_rate": 1e-06,
"loss": 0.0648,
"step": 1599
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.2846733182668686,
"epoch": 4.2105263157894735,
"grad_norm": 0.020705915987491608,
"learning_rate": 1e-06,
"loss": 0.1821,
"step": 1600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4576.0,
"completions/mean_length": 2429.921875,
"completions/mean_terminated_length": 784.6812133789062,
"completions/min_length": 261.0,
"completions/min_terminated_length": 261.0,
"entropy": 0.2895173579454422,
"epoch": 4.213157894736842,
"frac_reward_zero_std": 0.40625,
"grad_norm": 806.2984619140625,
"learning_rate": 1e-06,
"loss": 0.1064,
"num_tokens": 533202356.0,
"reward": 0.7311378717422485,
"reward_std": 0.14681148529052734,
"rewards/progression_diversity/mean": -0.030749481171369553,
"rewards/progression_diversity/std": 0.10168987512588501,
"rewards/symbolic_reward_accuracy/mean": 0.794921875,
"rewards/symbolic_reward_accuracy/std": 0.4041535556316376,
"rewards/symbolic_reward_partial_score/mean": 0.876953125,
"rewards/symbolic_reward_partial_score/std": 0.2930140495300293,
"rewards/tag_count_reward/mean": -0.0859375,
"rewards/tag_count_reward/std": 0.28054583072662354,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0381789207458496,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 352.0,
"sampling/sampling_logp_difference/mean": 4.134341239929199,
"step": 1601
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.28859490156173706,
"epoch": 4.215789473684211,
"grad_norm": 0.0084293307736516,
"learning_rate": 1e-06,
"loss": 0.0704,
"step": 1602
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.2843547612428665,
"epoch": 4.218421052631579,
"grad_norm": 0.0123353386297822,
"learning_rate": 1e-06,
"loss": 0.1586,
"step": 1603
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.27717554569244385,
"epoch": 4.221052631578948,
"grad_norm": 0.020575549453496933,
"learning_rate": 1e-06,
"loss": 0.1076,
"step": 1604
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3167.0,
"completions/mean_length": 1855.916015625,
"completions/mean_terminated_length": 757.1533813476562,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"entropy": 0.2824231684207916,
"epoch": 4.223684210526316,
"frac_reward_zero_std": 0.34375,
"grad_norm": 216.76248168945312,
"learning_rate": 1e-06,
"loss": 0.1544,
"num_tokens": 534562409.0,
"reward": 0.7700595259666443,
"reward_std": 0.1850237250328064,
"rewards/progression_diversity/mean": -0.025300273671746254,
"rewards/progression_diversity/std": 0.09943580627441406,
"rewards/symbolic_reward_accuracy/mean": 0.83984375,
"rewards/symbolic_reward_accuracy/std": 0.3671095669269562,
"rewards/symbolic_reward_partial_score/mean": 0.908203125,
"rewards/symbolic_reward_partial_score/std": 0.24698437750339508,
"rewards/tag_count_reward/mean": -0.060546875,
"rewards/tag_count_reward/std": 0.2387305200099945,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0366488695144653,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 356.0,
"sampling/sampling_logp_difference/mean": 4.752798080444336,
"step": 1605
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.30687089264392853,
"epoch": 4.226315789473684,
"grad_norm": 0.03631648048758507,
"learning_rate": 1e-06,
"loss": 0.0767,
"step": 1606
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.2971319705247879,
"epoch": 4.228947368421053,
"grad_norm": 0.016691574826836586,
"learning_rate": 1e-06,
"loss": 0.1243,
"step": 1607
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.29933421313762665,
"epoch": 4.231578947368421,
"grad_norm": 0.024404583498835564,
"learning_rate": 1e-06,
"loss": 0.0898,
"step": 1608
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2285.0,
"completions/mean_length": 1166.900390625,
"completions/mean_terminated_length": 644.29296875,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"entropy": 0.3166111558675766,
"epoch": 4.234210526315789,
"frac_reward_zero_std": 0.53125,
"grad_norm": 546.3336791992188,
"learning_rate": 1e-06,
"loss": 0.0718,
"num_tokens": 535559318.0,
"reward": 0.8583120107650757,
"reward_std": 0.08758790791034698,
"rewards/progression_diversity/mean": -0.013530386611819267,
"rewards/progression_diversity/std": 0.0759081244468689,
"rewards/symbolic_reward_accuracy/mean": 0.94921875,
"rewards/symbolic_reward_accuracy/std": 0.21976542472839355,
"rewards/symbolic_reward_partial_score/mean": 0.9734700918197632,
"rewards/symbolic_reward_partial_score/std": 0.13808730244636536,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.050557017326355,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 356.0,
"sampling/sampling_logp_difference/mean": 3.118114471435547,
"step": 1609
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.31313398480415344,
"epoch": 4.2368421052631575,
"grad_norm": 0.01867660880088806,
"learning_rate": 1e-06,
"loss": 0.1433,
"step": 1610
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.31812839210033417,
"epoch": 4.239473684210527,
"grad_norm": 0.00977341365069151,
"learning_rate": 1e-06,
"loss": 0.0707,
"step": 1611
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.32100366055965424,
"epoch": 4.242105263157895,
"grad_norm": 0.006353206932544708,
"learning_rate": 1e-06,
"loss": 0.0522,
"step": 1612
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2459.0,
"completions/mean_length": 1447.01953125,
"completions/mean_terminated_length": 712.4138793945312,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.286122590303421,
"epoch": 4.244736842105263,
"frac_reward_zero_std": 0.3125,
"grad_norm": 307.3343200683594,
"learning_rate": 1e-06,
"loss": 0.1766,
"num_tokens": 536710016.0,
"reward": 0.7893308401107788,
"reward_std": 0.1426737904548645,
"rewards/progression_diversity/mean": -0.017116881906986237,
"rewards/progression_diversity/std": 0.0786111056804657,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.9259439706802368,
"rewards/symbolic_reward_partial_score/std": 0.20985758304595947,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0420353412628174,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 358.0,
"sampling/sampling_logp_difference/mean": 4.076284408569336,
"step": 1613
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.29413002729415894,
"epoch": 4.247368421052632,
"grad_norm": 0.011863662861287594,
"learning_rate": 1e-06,
"loss": 0.1084,
"step": 1614
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.32583458721637726,
"epoch": 4.25,
"grad_norm": 0.009227105416357517,
"learning_rate": 1e-06,
"loss": 0.0131,
"step": 1615
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.30109357833862305,
"epoch": 4.252631578947368,
"grad_norm": 0.08455037325620651,
"learning_rate": 1e-06,
"loss": 0.0389,
"step": 1616
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3129.0,
"completions/mean_length": 1303.62109375,
"completions/mean_terminated_length": 690.5975341796875,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 0.3038036525249481,
"epoch": 4.255263157894737,
"frac_reward_zero_std": 0.46875,
"grad_norm": 244.20498657226562,
"learning_rate": 1e-06,
"loss": 0.1133,
"num_tokens": 537764254.0,
"reward": 0.804751992225647,
"reward_std": 0.1407267451286316,
"rewards/progression_diversity/mean": -0.01308908686041832,
"rewards/progression_diversity/std": 0.06904362142086029,
"rewards/symbolic_reward_accuracy/mean": 0.880859375,
"rewards/symbolic_reward_accuracy/std": 0.32427072525024414,
"rewards/symbolic_reward_partial_score/mean": 0.9322916269302368,
"rewards/symbolic_reward_partial_score/std": 0.21590134501457214,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.049241065979004,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 360.0,
"sampling/sampling_logp_difference/mean": 3.09298038482666,
"step": 1617
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.31994016468524933,
"epoch": 4.257894736842105,
"grad_norm": 0.010734348557889462,
"learning_rate": 1e-06,
"loss": 0.0265,
"step": 1618
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.30486488342285156,
"epoch": 4.260526315789473,
"grad_norm": 0.010809490457177162,
"learning_rate": 1e-06,
"loss": 0.0587,
"step": 1619
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2901280075311661,
"epoch": 4.2631578947368425,
"grad_norm": 0.03101206384599209,
"learning_rate": 1e-06,
"loss": 0.086,
"step": 1620
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3629.0,
"completions/mean_length": 1185.048828125,
"completions/mean_terminated_length": 663.064697265625,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.3062327206134796,
"epoch": 4.265789473684211,
"frac_reward_zero_std": 0.34375,
"grad_norm": 202.47483825683594,
"learning_rate": 1e-06,
"loss": 0.0557,
"num_tokens": 538737751.0,
"reward": 0.8110127449035645,
"reward_std": 0.16020728647708893,
"rewards/progression_diversity/mean": -0.012012619525194168,
"rewards/progression_diversity/std": 0.06591516733169556,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9381510019302368,
"rewards/symbolic_reward_partial_score/std": 0.19927191734313965,
"rewards/tag_count_reward/mean": -0.0234375,
"rewards/tag_count_reward/std": 0.15143637359142303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.049810528755188,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 360.0,
"sampling/sampling_logp_difference/mean": 3.169362783432007,
"step": 1621
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.2987634390592575,
"epoch": 4.268421052631579,
"grad_norm": 141.50709533691406,
"learning_rate": 1e-06,
"loss": 0.1229,
"step": 1622
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.3115040361881256,
"epoch": 4.271052631578947,
"grad_norm": 0.011638534255325794,
"learning_rate": 1e-06,
"loss": 0.0587,
"step": 1623
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.34375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.31850266456604004,
"epoch": 4.273684210526316,
"grad_norm": 0.00664818100631237,
"learning_rate": 1e-06,
"loss": 0.0181,
"step": 1624
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2810.0,
"completions/mean_length": 1301.916015625,
"completions/mean_terminated_length": 752.3663940429688,
"completions/min_length": 271.0,
"completions/min_terminated_length": 271.0,
"entropy": 0.3117539584636688,
"epoch": 4.276315789473684,
"frac_reward_zero_std": 0.34375,
"grad_norm": 244.9304962158203,
"learning_rate": 1e-06,
"loss": 0.0613,
"num_tokens": 539834028.0,
"reward": 0.8020786046981812,
"reward_std": 0.16100260615348816,
"rewards/progression_diversity/mean": -0.011866888031363487,
"rewards/progression_diversity/std": 0.06344626098871231,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.93896484375,
"rewards/symbolic_reward_partial_score/std": 0.1891382783651352,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0457390546798706,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 364.0,
"sampling/sampling_logp_difference/mean": 3.5810694694519043,
"step": 1625
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2959267497062683,
"epoch": 4.278947368421052,
"grad_norm": 0.020144561305642128,
"learning_rate": 1e-06,
"loss": 0.1104,
"step": 1626
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.3018384575843811,
"epoch": 4.281578947368421,
"grad_norm": 0.014604386873543262,
"learning_rate": 1e-06,
"loss": 0.0735,
"step": 1627
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.30800075829029083,
"epoch": 4.284210526315789,
"grad_norm": 0.014517586678266525,
"learning_rate": 1e-06,
"loss": 0.0251,
"step": 1628
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3842.0,
"completions/mean_length": 956.640625,
"completions/mean_terminated_length": 711.761962890625,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"entropy": 0.3077455312013626,
"epoch": 4.286842105263158,
"frac_reward_zero_std": 0.5625,
"grad_norm": 398.7921447753906,
"learning_rate": 1e-06,
"loss": 0.0327,
"num_tokens": 540715284.0,
"reward": 0.847118616104126,
"reward_std": 0.09315143525600433,
"rewards/progression_diversity/mean": -0.004935206845402718,
"rewards/progression_diversity/std": 0.043800655752420425,
"rewards/symbolic_reward_accuracy/mean": 0.9296875,
"rewards/symbolic_reward_accuracy/std": 0.25592297315597534,
"rewards/symbolic_reward_partial_score/mean": 0.9690755605697632,
"rewards/symbolic_reward_partial_score/std": 0.13879333436489105,
"rewards/tag_count_reward/mean": -0.013671875,
"rewards/tag_count_reward/std": 0.1162383034825325,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.059929609298706,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 364.0,
"sampling/sampling_logp_difference/mean": 1.7687201499938965,
"step": 1629
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.3137192130088806,
"epoch": 4.2894736842105265,
"grad_norm": 0.01701788231730461,
"learning_rate": 1e-06,
"loss": 0.0515,
"step": 1630
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.32366204261779785,
"epoch": 4.292105263157895,
"grad_norm": 0.03292662650346756,
"learning_rate": 1e-06,
"loss": 0.0018,
"step": 1631
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.3094526529312134,
"epoch": 4.294736842105263,
"grad_norm": 0.02371911332011223,
"learning_rate": 1e-06,
"loss": 0.064,
"step": 1632
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4493.0,
"completions/mean_length": 1390.392578125,
"completions/mean_terminated_length": 717.2101440429688,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.30759990215301514,
"epoch": 4.2973684210526315,
"frac_reward_zero_std": 0.59375,
"grad_norm": 396.1076965332031,
"learning_rate": 1e-06,
"loss": 0.082,
"num_tokens": 541829469.0,
"reward": 0.7947248220443726,
"reward_std": 0.0687393993139267,
"rewards/progression_diversity/mean": -0.009939271956682205,
"rewards/progression_diversity/std": 0.055136967450380325,
"rewards/symbolic_reward_accuracy/mean": 0.8671875,
"rewards/symbolic_reward_accuracy/std": 0.33970388770103455,
"rewards/symbolic_reward_partial_score/mean": 0.9267578125,
"rewards/symbolic_reward_partial_score/std": 0.2075078934431076,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.054058313369751,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 364.0,
"sampling/sampling_logp_difference/mean": 2.6795334815979004,
"step": 1633
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.3071286678314209,
"epoch": 4.3,
"grad_norm": 0.010865924879908562,
"learning_rate": 1e-06,
"loss": 0.0643,
"step": 1634
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.31138360500335693,
"epoch": 4.302631578947368,
"grad_norm": 0.012006482109427452,
"learning_rate": 1e-06,
"loss": 0.0281,
"step": 1635
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.3028770089149475,
"epoch": 4.3052631578947365,
"grad_norm": 0.30845019221305847,
"learning_rate": 1e-06,
"loss": 0.0632,
"step": 1636
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2767.0,
"completions/mean_length": 868.23828125,
"completions/mean_terminated_length": 590.6202392578125,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"entropy": 0.3321658670902252,
"epoch": 4.307894736842106,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.008871855214238167,
"learning_rate": 1e-06,
"loss": 0.0084,
"num_tokens": 542626583.0,
"reward": 0.8462392091751099,
"reward_std": 0.11010201275348663,
"rewards/progression_diversity/mean": -0.0049893660470843315,
"rewards/progression_diversity/std": 0.04094173386693001,
"rewards/symbolic_reward_accuracy/mean": 0.927734375,
"rewards/symbolic_reward_accuracy/std": 0.2591804563999176,
"rewards/symbolic_reward_partial_score/mean": 0.970703125,
"rewards/symbolic_reward_partial_score/std": 0.12699371576309204,
"rewards/tag_count_reward/mean": -0.015625,
"rewards/tag_count_reward/std": 0.12414088100194931,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0612621307373047,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 366.0,
"sampling/sampling_logp_difference/mean": 2.047287940979004,
"step": 1637
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.3181557357311249,
"epoch": 4.310526315789474,
"grad_norm": 0.023050582036376,
"learning_rate": 1e-06,
"loss": 0.0581,
"step": 1638
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.3208106458187103,
"epoch": 4.313157894736842,
"grad_norm": 0.01196917425841093,
"learning_rate": 1e-06,
"loss": 0.0703,
"step": 1639
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.3326588422060013,
"epoch": 4.315789473684211,
"grad_norm": 0.011661848984658718,
"learning_rate": 1e-06,
"loss": -0.0092,
"step": 1640
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2995.0,
"completions/mean_length": 1302.34765625,
"completions/mean_terminated_length": 752.8137817382812,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"entropy": 0.2890154719352722,
"epoch": 4.318421052631579,
"frac_reward_zero_std": 0.40625,
"grad_norm": 436.7637939453125,
"learning_rate": 1e-06,
"loss": 0.1363,
"num_tokens": 543714537.0,
"reward": 0.8115625977516174,
"reward_std": 0.11080306768417358,
"rewards/progression_diversity/mean": -0.010732462629675865,
"rewards/progression_diversity/std": 0.062084443867206573,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9412435293197632,
"rewards/symbolic_reward_partial_score/std": 0.18950168788433075,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.048242211341858,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 366.0,
"sampling/sampling_logp_difference/mean": 3.6092000007629395,
"step": 1641
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2962608188390732,
"epoch": 4.321052631578947,
"grad_norm": 0.027114521712064743,
"learning_rate": 1e-06,
"loss": 0.0896,
"step": 1642
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.3004533350467682,
"epoch": 4.323684210526316,
"grad_norm": 0.021577974781394005,
"learning_rate": 1e-06,
"loss": 0.0285,
"step": 1643
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.30283403396606445,
"epoch": 4.326315789473684,
"grad_norm": 0.01051260158419609,
"learning_rate": 1e-06,
"loss": 0.0431,
"step": 1644
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.056640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3053.0,
"completions/mean_length": 1617.533203125,
"completions/mean_terminated_length": 730.9337768554688,
"completions/min_length": 253.0,
"completions/min_terminated_length": 253.0,
"entropy": 0.28192102909088135,
"epoch": 4.328947368421053,
"frac_reward_zero_std": 0.40625,
"grad_norm": 224.36424255371094,
"learning_rate": 1e-06,
"loss": 0.0977,
"num_tokens": 544947706.0,
"reward": 0.7883968949317932,
"reward_std": 0.1514188051223755,
"rewards/progression_diversity/mean": -0.017739124596118927,
"rewards/progression_diversity/std": 0.07712989300489426,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.9241536259651184,
"rewards/symbolic_reward_partial_score/std": 0.2255030870437622,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0469114780426025,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 3.760016441345215,
"step": 1645
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.30050475895404816,
"epoch": 4.331578947368421,
"grad_norm": 0.010516730137169361,
"learning_rate": 1e-06,
"loss": 0.0643,
"step": 1646
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.29950810968875885,
"epoch": 4.33421052631579,
"grad_norm": 4.544126510620117,
"learning_rate": 1e-06,
"loss": 0.0271,
"step": 1647
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.29230254888534546,
"epoch": 4.336842105263158,
"grad_norm": 4.5623393058776855,
"learning_rate": 1e-06,
"loss": 0.0658,
"step": 1648
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3712.0,
"completions/mean_length": 1151.015625,
"completions/mean_terminated_length": 659.6290283203125,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"entropy": 0.31469690799713135,
"epoch": 4.339473684210526,
"frac_reward_zero_std": 0.53125,
"grad_norm": 301.08685302734375,
"learning_rate": 1e-06,
"loss": 0.0391,
"num_tokens": 545939170.0,
"reward": 0.8617491722106934,
"reward_std": 0.09493400156497955,
"rewards/progression_diversity/mean": -0.011606581509113312,
"rewards/progression_diversity/std": 0.06665636599063873,
"rewards/symbolic_reward_accuracy/mean": 0.94921875,
"rewards/symbolic_reward_accuracy/std": 0.21976542472839355,
"rewards/symbolic_reward_partial_score/mean": 0.9777017831802368,
"rewards/symbolic_reward_partial_score/std": 0.12183935195207596,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0512254238128662,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 366.0,
"sampling/sampling_logp_difference/mean": 3.1432154178619385,
"step": 1649
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.30177412927150726,
"epoch": 4.342105263157895,
"grad_norm": 318.8957214355469,
"learning_rate": 1e-06,
"loss": 0.0878,
"step": 1650
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.2994520217180252,
"epoch": 4.344736842105263,
"grad_norm": 0.24904422461986542,
"learning_rate": 1e-06,
"loss": 0.0613,
"step": 1651
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.3131738752126694,
"epoch": 4.347368421052631,
"grad_norm": 0.016566958278417587,
"learning_rate": 1e-06,
"loss": 0.0589,
"step": 1652
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2135.0,
"completions/mean_length": 1417.859375,
"completions/mean_terminated_length": 649.5770263671875,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"entropy": 0.2909892499446869,
"epoch": 4.35,
"frac_reward_zero_std": 0.5,
"grad_norm": 281.6248474121094,
"learning_rate": 1e-06,
"loss": 0.0083,
"num_tokens": 547068762.0,
"reward": 0.8414824604988098,
"reward_std": 0.0903344452381134,
"rewards/progression_diversity/mean": -0.016799690201878548,
"rewards/progression_diversity/std": 0.07866127043962479,
"rewards/symbolic_reward_accuracy/mean": 0.921875,
"rewards/symbolic_reward_accuracy/std": 0.26863065361976624,
"rewards/symbolic_reward_partial_score/mean": 0.9656575322151184,
"rewards/symbolic_reward_partial_score/std": 0.1427777260541916,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0441806316375732,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 4.0731892585754395,
"step": 1653
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.29775403439998627,
"epoch": 4.352631578947369,
"grad_norm": 1.3435653448104858,
"learning_rate": 1e-06,
"loss": 0.0371,
"step": 1654
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.27768969535827637,
"epoch": 4.355263157894737,
"grad_norm": 0.009355852380394936,
"learning_rate": 1e-06,
"loss": 0.1486,
"step": 1655
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.3120851516723633,
"epoch": 4.3578947368421055,
"grad_norm": 0.5682896375656128,
"learning_rate": 1e-06,
"loss": 0.0239,
"step": 1656
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3438.0,
"completions/mean_length": 1578.765625,
"completions/mean_terminated_length": 722.264404296875,
"completions/min_length": 238.0,
"completions/min_terminated_length": 238.0,
"entropy": 0.2924637943506241,
"epoch": 4.360526315789474,
"frac_reward_zero_std": 0.375,
"grad_norm": 165.94813537597656,
"learning_rate": 1e-06,
"loss": 0.0048,
"num_tokens": 548296738.0,
"reward": 0.8284804821014404,
"reward_std": 0.11700120568275452,
"rewards/progression_diversity/mean": -0.018165672197937965,
"rewards/progression_diversity/std": 0.07916979491710663,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9549153447151184,
"rewards/symbolic_reward_partial_score/std": 0.1701822131872177,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.041003704071045,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 366.0,
"sampling/sampling_logp_difference/mean": 4.021053314208984,
"step": 1657
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.2971417158842087,
"epoch": 4.363157894736842,
"grad_norm": 0.03589131683111191,
"learning_rate": 1e-06,
"loss": 0.082,
"step": 1658
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.28301428258419037,
"epoch": 4.36578947368421,
"grad_norm": 0.021585499867796898,
"learning_rate": 1e-06,
"loss": 0.1136,
"step": 1659
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.28591735661029816,
"epoch": 4.368421052631579,
"grad_norm": 0.006448809057474136,
"learning_rate": 1e-06,
"loss": 0.131,
"step": 1660
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2655.0,
"completions/mean_length": 1222.650390625,
"completions/mean_terminated_length": 701.9575805664062,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"entropy": 0.29014408588409424,
"epoch": 4.371052631578947,
"frac_reward_zero_std": 0.4375,
"grad_norm": 705.5008544921875,
"learning_rate": 1e-06,
"loss": 0.0896,
"num_tokens": 549336047.0,
"reward": 0.8229233026504517,
"reward_std": 0.11057721078395844,
"rewards/progression_diversity/mean": -0.012355408631265163,
"rewards/progression_diversity/std": 0.06910999119281769,
"rewards/symbolic_reward_accuracy/mean": 0.892578125,
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534,
"rewards/symbolic_reward_partial_score/mean": 0.9622395634651184,
"rewards/symbolic_reward_partial_score/std": 0.1456439197063446,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0430617332458496,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 366.0,
"sampling/sampling_logp_difference/mean": 3.94387149810791,
"step": 1661
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2902925908565521,
"epoch": 4.373684210526315,
"grad_norm": 184.36441040039062,
"learning_rate": 1e-06,
"loss": 0.1117,
"step": 1662
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2993343472480774,
"epoch": 4.376315789473685,
"grad_norm": 0.006076624616980553,
"learning_rate": 1e-06,
"loss": 0.0502,
"step": 1663
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.30178727209568024,
"epoch": 4.378947368421053,
"grad_norm": 0.00712067075073719,
"learning_rate": 1e-06,
"loss": 0.0184,
"step": 1664
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.060546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3090.0,
"completions/mean_length": 1699.12109375,
"completions/mean_terminated_length": 752.6943969726562,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"entropy": 0.2760689854621887,
"epoch": 4.381578947368421,
"frac_reward_zero_std": 0.34375,
"grad_norm": 577.8743896484375,
"learning_rate": 1e-06,
"loss": 0.0909,
"num_tokens": 550629997.0,
"reward": 0.8087052702903748,
"reward_std": 0.1341739296913147,
"rewards/progression_diversity/mean": -0.018151385709643364,
"rewards/progression_diversity/std": 0.07787839323282242,
"rewards/symbolic_reward_accuracy/mean": 0.880859375,
"rewards/symbolic_reward_accuracy/std": 0.32427072525024414,
"rewards/symbolic_reward_partial_score/mean": 0.9462890625,
"rewards/symbolic_reward_partial_score/std": 0.18004465103149414,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0394903421401978,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 364.0,
"sampling/sampling_logp_difference/mean": 4.506129741668701,
"step": 1665
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.26462263613939285,
"epoch": 4.38421052631579,
"grad_norm": 1588.7640380859375,
"learning_rate": 1e-06,
"loss": 0.3391,
"step": 1666
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.29205040633678436,
"epoch": 4.386842105263158,
"grad_norm": 0.009880495257675648,
"learning_rate": 1e-06,
"loss": 0.0564,
"step": 1667
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.28884167969226837,
"epoch": 4.389473684210526,
"grad_norm": 0.010468707419931889,
"learning_rate": 1e-06,
"loss": 0.0718,
"step": 1668
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3253.0,
"completions/mean_length": 1559.23828125,
"completions/mean_terminated_length": 733.9423217773438,
"completions/min_length": 238.0,
"completions/min_terminated_length": 238.0,
"entropy": 0.26679860055446625,
"epoch": 4.3921052631578945,
"frac_reward_zero_std": 0.4375,
"grad_norm": 465.1615295410156,
"learning_rate": 1e-06,
"loss": 0.1201,
"num_tokens": 551842663.0,
"reward": 0.7915906310081482,
"reward_std": 0.1362362802028656,
"rewards/progression_diversity/mean": -0.015742970630526543,
"rewards/progression_diversity/std": 0.07118507474660873,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.9275715947151184,
"rewards/symbolic_reward_partial_score/std": 0.2216237634420395,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0484164953231812,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 366.0,
"sampling/sampling_logp_difference/mean": 3.136202096939087,
"step": 1669
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2917633354663849,
"epoch": 4.394736842105263,
"grad_norm": 0.009237710386514664,
"learning_rate": 1e-06,
"loss": 0.0597,
"step": 1670
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.29464130103588104,
"epoch": 4.397368421052631,
"grad_norm": 0.020612401887774467,
"learning_rate": 1e-06,
"loss": 0.0545,
"step": 1671
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.29814429581165314,
"epoch": 4.4,
"grad_norm": 0.023844098672270775,
"learning_rate": 1e-06,
"loss": 0.0525,
"step": 1672
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3713.0,
"completions/mean_length": 1361.876953125,
"completions/mean_terminated_length": 655.31494140625,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.29356473684310913,
"epoch": 4.402631578947369,
"frac_reward_zero_std": 0.375,
"grad_norm": 48.91466522216797,
"learning_rate": 1e-06,
"loss": 0.0294,
"num_tokens": 552934888.0,
"reward": 0.8409208655357361,
"reward_std": 0.11717473715543747,
"rewards/progression_diversity/mean": -0.01435843575745821,
"rewards/progression_diversity/std": 0.06984733790159225,
"rewards/symbolic_reward_accuracy/mean": 0.927734375,
"rewards/symbolic_reward_accuracy/std": 0.2591804563999176,
"rewards/symbolic_reward_partial_score/mean": 0.95849609375,
"rewards/symbolic_reward_partial_score/std": 0.1762569397687912,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0481362342834473,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 366.0,
"sampling/sampling_logp_difference/mean": 3.0723586082458496,
"step": 1673
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.28971320390701294,
"epoch": 4.405263157894737,
"grad_norm": 17.948726654052734,
"learning_rate": 1e-06,
"loss": 0.1098,
"step": 1674
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.29446932673454285,
"epoch": 4.407894736842105,
"grad_norm": 0.010464577004313469,
"learning_rate": 1e-06,
"loss": 0.0555,
"step": 1675
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.28618983924388885,
"epoch": 4.410526315789474,
"grad_norm": 0.010811169631779194,
"learning_rate": 1e-06,
"loss": 0.1078,
"step": 1676
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2688.0,
"completions/mean_length": 1110.826171875,
"completions/mean_terminated_length": 649.8651733398438,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.29554495215415955,
"epoch": 4.413157894736842,
"frac_reward_zero_std": 0.3125,
"grad_norm": 467.7513427734375,
"learning_rate": 1e-06,
"loss": 0.1092,
"num_tokens": 553879247.0,
"reward": 0.8643869161605835,
"reward_std": 0.11433817446231842,
"rewards/progression_diversity/mean": -0.011511346325278282,
"rewards/progression_diversity/std": 0.06805232167243958,
"rewards/symbolic_reward_accuracy/mean": 0.953125,
"rewards/symbolic_reward_accuracy/std": 0.21157780289649963,
"rewards/symbolic_reward_partial_score/mean": 0.9793294668197632,
"rewards/symbolic_reward_partial_score/std": 0.1068115308880806,
"rewards/tag_count_reward/mean": -0.01171875,
"rewards/tag_count_reward/std": 0.10772226005792618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0465220212936401,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 3.3040924072265625,
"step": 1677
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3359375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2996697723865509,
"epoch": 4.41578947368421,
"grad_norm": 0.005097354296594858,
"learning_rate": 1e-06,
"loss": 0.0766,
"step": 1678
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.30967849493026733,
"epoch": 4.418421052631579,
"grad_norm": 0.008583194576203823,
"learning_rate": 1e-06,
"loss": 0.0206,
"step": 1679
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.2984393239021301,
"epoch": 4.421052631578947,
"grad_norm": 0.020960450172424316,
"learning_rate": 1e-06,
"loss": 0.0607,
"step": 1680
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3201.0,
"completions/mean_length": 1718.21875,
"completions/mean_terminated_length": 805.4108276367188,
"completions/min_length": 252.0,
"completions/min_terminated_length": 252.0,
"entropy": 0.2852473855018616,
"epoch": 4.423684210526316,
"frac_reward_zero_std": 0.46875,
"grad_norm": 566.0900268554688,
"learning_rate": 1e-06,
"loss": 0.1011,
"num_tokens": 555183295.0,
"reward": 0.8038582801818848,
"reward_std": 0.10909964144229889,
"rewards/progression_diversity/mean": -0.019448932260274887,
"rewards/progression_diversity/std": 0.08251741528511047,
"rewards/symbolic_reward_accuracy/mean": 0.87890625,
"rewards/symbolic_reward_accuracy/std": 0.3265552520751953,
"rewards/symbolic_reward_partial_score/mean": 0.93798828125,
"rewards/symbolic_reward_partial_score/std": 0.2074093222618103,
"rewards/tag_count_reward/mean": -0.046875,
"rewards/tag_count_reward/std": 0.21157780289649963,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0440826416015625,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 3.4236390590667725,
"step": 1681
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2848607301712036,
"epoch": 4.426315789473684,
"grad_norm": 0.014388111419975758,
"learning_rate": 1e-06,
"loss": 0.0661,
"step": 1682
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.279374361038208,
"epoch": 4.428947368421053,
"grad_norm": 0.014383941888809204,
"learning_rate": 1e-06,
"loss": 0.0695,
"step": 1683
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2949568033218384,
"epoch": 4.431578947368421,
"grad_norm": 0.015598650090396404,
"learning_rate": 1e-06,
"loss": 0.0831,
"step": 1684
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15587.0,
"completions/mean_length": 1525.705078125,
"completions/mean_terminated_length": 762.958984375,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"entropy": 0.300376757979393,
"epoch": 4.434210526315789,
"frac_reward_zero_std": 0.3125,
"grad_norm": 411.582275390625,
"learning_rate": 1e-06,
"loss": 0.0576,
"num_tokens": 556383464.0,
"reward": 0.8135613203048706,
"reward_std": 0.15236347913742065,
"rewards/progression_diversity/mean": -0.015942061319947243,
"rewards/progression_diversity/std": 0.07582908123731613,
"rewards/symbolic_reward_accuracy/mean": 0.892578125,
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534,
"rewards/symbolic_reward_partial_score/mean": 0.9415689706802368,
"rewards/symbolic_reward_partial_score/std": 0.20094631612300873,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0429538488388062,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 3.6773757934570312,
"step": 1685
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.2941613495349884,
"epoch": 4.436842105263158,
"grad_norm": 0.00848439708352089,
"learning_rate": 1e-06,
"loss": 0.0689,
"step": 1686
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2935366630554199,
"epoch": 4.439473684210526,
"grad_norm": 6.972188949584961,
"learning_rate": 1e-06,
"loss": 0.0504,
"step": 1687
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2752288281917572,
"epoch": 4.442105263157894,
"grad_norm": 0.005782074760645628,
"learning_rate": 1e-06,
"loss": 0.1494,
"step": 1688
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2501.0,
"completions/mean_length": 1201.7578125,
"completions/mean_terminated_length": 743.5411987304688,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"entropy": 0.29591110348701477,
"epoch": 4.4447368421052635,
"frac_reward_zero_std": 0.375,
"grad_norm": 890.4512939453125,
"learning_rate": 1e-06,
"loss": 0.0993,
"num_tokens": 557391724.0,
"reward": 0.8423638343811035,
"reward_std": 0.1323879361152649,
"rewards/progression_diversity/mean": -0.011668046936392784,
"rewards/progression_diversity/std": 0.06880556792020798,
"rewards/symbolic_reward_accuracy/mean": 0.92578125,
"rewards/symbolic_reward_accuracy/std": 0.2623828947544098,
"rewards/symbolic_reward_partial_score/mean": 0.9638671875,
"rewards/symbolic_reward_partial_score/std": 0.1620253175497055,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0433096885681152,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 3.715100049972534,
"step": 1689
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3018334209918976,
"epoch": 4.447368421052632,
"grad_norm": 0.013348652981221676,
"learning_rate": 1e-06,
"loss": 0.0512,
"step": 1690
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2954847514629364,
"epoch": 4.45,
"grad_norm": 0.00606268085539341,
"learning_rate": 1e-06,
"loss": 0.0952,
"step": 1691
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.30556538701057434,
"epoch": 4.4526315789473685,
"grad_norm": 0.008453385904431343,
"learning_rate": 1e-06,
"loss": 0.0245,
"step": 1692
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.041015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3241.0,
"completions/mean_length": 1361.220703125,
"completions/mean_terminated_length": 718.6986083984375,
"completions/min_length": 264.0,
"completions/min_terminated_length": 264.0,
"entropy": 0.2938489019870758,
"epoch": 4.455263157894737,
"frac_reward_zero_std": 0.4375,
"grad_norm": 246.77127075195312,
"learning_rate": 1e-06,
"loss": 0.0901,
"num_tokens": 558478461.0,
"reward": 0.8210785984992981,
"reward_std": 0.12946206331253052,
"rewards/progression_diversity/mean": -0.011284667067229748,
"rewards/progression_diversity/std": 0.06149439141154289,
"rewards/symbolic_reward_accuracy/mean": 0.8984375,
"rewards/symbolic_reward_accuracy/std": 0.30236753821372986,
"rewards/symbolic_reward_partial_score/mean": 0.9495442509651184,
"rewards/symbolic_reward_partial_score/std": 0.181060791015625,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.046877145767212,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 3.4112725257873535,
"step": 1693
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.3013473302125931,
"epoch": 4.457894736842105,
"grad_norm": 0.0174638070166111,
"learning_rate": 1e-06,
"loss": 0.0643,
"step": 1694
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2852264940738678,
"epoch": 4.4605263157894735,
"grad_norm": 0.011137026362121105,
"learning_rate": 1e-06,
"loss": 0.1265,
"step": 1695
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.30904076993465424,
"epoch": 4.463157894736842,
"grad_norm": 0.007532245479524136,
"learning_rate": 1e-06,
"loss": 0.0096,
"step": 1696
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.056640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3318.0,
"completions/mean_length": 1733.322265625,
"completions/mean_terminated_length": 853.6749877929688,
"completions/min_length": 244.0,
"completions/min_terminated_length": 244.0,
"entropy": 0.275277316570282,
"epoch": 4.465789473684211,
"frac_reward_zero_std": 0.5,
"grad_norm": 413.572509765625,
"learning_rate": 1e-06,
"loss": 0.1591,
"num_tokens": 559778914.0,
"reward": 0.8111288547515869,
"reward_std": 0.0881713479757309,
"rewards/progression_diversity/mean": -0.015045925043523312,
"rewards/progression_diversity/std": 0.06671184301376343,
"rewards/symbolic_reward_accuracy/mean": 0.89453125,
"rewards/symbolic_reward_accuracy/std": 0.3074568510055542,
"rewards/symbolic_reward_partial_score/mean": 0.93212890625,
"rewards/symbolic_reward_partial_score/std": 0.21600748598575592,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0425660610198975,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 3.744896650314331,
"step": 1697
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2827970087528229,
"epoch": 4.468421052631579,
"grad_norm": 0.008820455521345139,
"learning_rate": 1e-06,
"loss": 0.0781,
"step": 1698
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.27910783886909485,
"epoch": 4.471052631578948,
"grad_norm": 0.009887280873954296,
"learning_rate": 1e-06,
"loss": 0.0669,
"step": 1699
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.2814963161945343,
"epoch": 4.473684210526316,
"grad_norm": 0.011021401733160019,
"learning_rate": 1e-06,
"loss": 0.083,
"step": 1700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2883.0,
"completions/mean_length": 1051.0390625,
"completions/mean_terminated_length": 745.6016235351562,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"entropy": 0.29189206659793854,
"epoch": 4.476315789473684,
"frac_reward_zero_std": 0.53125,
"grad_norm": 435.6475830078125,
"learning_rate": 1e-06,
"loss": 0.0678,
"num_tokens": 560717846.0,
"reward": 0.8404079675674438,
"reward_std": 0.11957277357578278,
"rewards/progression_diversity/mean": -0.007058565504848957,
"rewards/progression_diversity/std": 0.05117020010948181,
"rewards/symbolic_reward_accuracy/mean": 0.919921875,
"rewards/symbolic_reward_accuracy/std": 0.271679550409317,
"rewards/symbolic_reward_partial_score/mean": 0.9676106572151184,
"rewards/symbolic_reward_partial_score/std": 0.13442426919937134,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0530710220336914,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 2.474128484725952,
"step": 1701
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2915034145116806,
"epoch": 4.478947368421053,
"grad_norm": 0.009802755899727345,
"learning_rate": 1e-06,
"loss": 0.0604,
"step": 1702
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.30035799741744995,
"epoch": 4.481578947368421,
"grad_norm": 0.007560526020824909,
"learning_rate": 1e-06,
"loss": 0.0481,
"step": 1703
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.3007005453109741,
"epoch": 4.484210526315789,
"grad_norm": 0.007429624907672405,
"learning_rate": 1e-06,
"loss": 0.0331,
"step": 1704
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3036.0,
"completions/mean_length": 1491.83984375,
"completions/mean_terminated_length": 791.3905639648438,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"entropy": 0.2913738787174225,
"epoch": 4.4868421052631575,
"frac_reward_zero_std": 0.34375,
"grad_norm": 304.2225341796875,
"learning_rate": 1e-06,
"loss": 0.0793,
"num_tokens": 561894820.0,
"reward": 0.8073344230651855,
"reward_std": 0.1524999737739563,
"rewards/progression_diversity/mean": -0.013631231151521206,
"rewards/progression_diversity/std": 0.06903047114610672,
"rewards/symbolic_reward_accuracy/mean": 0.880859375,
"rewards/symbolic_reward_accuracy/std": 0.32427072525024414,
"rewards/symbolic_reward_partial_score/mean": 0.9435221552848816,
"rewards/symbolic_reward_partial_score/std": 0.19161830842494965,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0431602001190186,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 368.0,
"sampling/sampling_logp_difference/mean": 3.739988088607788,
"step": 1705
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2814719080924988,
"epoch": 4.489473684210527,
"grad_norm": 0.014499134384095669,
"learning_rate": 1e-06,
"loss": 0.1048,
"step": 1706
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.294638067483902,
"epoch": 4.492105263157895,
"grad_norm": 0.01003054529428482,
"learning_rate": 1e-06,
"loss": 0.0582,
"step": 1707
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2986578643321991,
"epoch": 4.494736842105263,
"grad_norm": 0.014435573481023312,
"learning_rate": 1e-06,
"loss": 0.0615,
"step": 1708
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2514.0,
"completions/mean_length": 1135.01953125,
"completions/mean_terminated_length": 769.0440063476562,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"entropy": 0.3132117986679077,
"epoch": 4.497368421052632,
"frac_reward_zero_std": 0.59375,
"grad_norm": 0.014912238344550133,
"learning_rate": 1e-06,
"loss": 0.0168,
"num_tokens": 562862734.0,
"reward": 0.832605242729187,
"reward_std": 0.09201730787754059,
"rewards/progression_diversity/mean": -0.006079651415348053,
"rewards/progression_diversity/std": 0.044652536511421204,
"rewards/symbolic_reward_accuracy/mean": 0.912109375,
"rewards/symbolic_reward_accuracy/std": 0.2834126651287079,
"rewards/symbolic_reward_partial_score/mean": 0.95849609375,
"rewards/symbolic_reward_partial_score/std": 0.15315403044223785,
"rewards/tag_count_reward/mean": -0.021484375,
"rewards/tag_count_reward/std": 0.14513419568538666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.052751064300537,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 370.0,
"sampling/sampling_logp_difference/mean": 2.5634231567382812,
"step": 1709
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.3069326877593994,
"epoch": 4.5,
"grad_norm": 0.00920610036700964,
"learning_rate": 1e-06,
"loss": 0.0197,
"step": 1710
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.28757910430431366,
"epoch": 4.502631578947368,
"grad_norm": 0.006706486456096172,
"learning_rate": 1e-06,
"loss": 0.1082,
"step": 1711
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.29572418332099915,
"epoch": 4.505263157894737,
"grad_norm": 0.008948331698775291,
"learning_rate": 1e-06,
"loss": 0.0682,
"step": 1712
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3745.0,
"completions/mean_length": 1313.30078125,
"completions/mean_terminated_length": 795.7212524414062,
"completions/min_length": 271.0,
"completions/min_terminated_length": 271.0,
"entropy": 0.29290150105953217,
"epoch": 4.507894736842105,
"frac_reward_zero_std": 0.59375,
"grad_norm": 150.0733184814453,
"learning_rate": 1e-06,
"loss": 0.0185,
"num_tokens": 563937928.0,
"reward": 0.8227621912956238,
"reward_std": 0.09591831266880035,
"rewards/progression_diversity/mean": -0.008939700201153755,
"rewards/progression_diversity/std": 0.055079780519008636,
"rewards/symbolic_reward_accuracy/mean": 0.896484375,
"rewards/symbolic_reward_accuracy/std": 0.30492907762527466,
"rewards/symbolic_reward_partial_score/mean": 0.958984375,
"rewards/symbolic_reward_partial_score/std": 0.14947035908699036,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0529078245162964,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 370.0,
"sampling/sampling_logp_difference/mean": 2.372925281524658,
"step": 1713
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.3026455044746399,
"epoch": 4.510526315789473,
"grad_norm": 0.013394175097346306,
"learning_rate": 1e-06,
"loss": 0.0088,
"step": 1714
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.29875096678733826,
"epoch": 4.5131578947368425,
"grad_norm": 0.007891521789133549,
"learning_rate": 1e-06,
"loss": 0.0854,
"step": 1715
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.29000018537044525,
"epoch": 4.515789473684211,
"grad_norm": 0.010807064361870289,
"learning_rate": 1e-06,
"loss": 0.0695,
"step": 1716
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3374.0,
"completions/mean_length": 1286.517578125,
"completions/mean_terminated_length": 830.859130859375,
"completions/min_length": 270.0,
"completions/min_terminated_length": 270.0,
"entropy": 0.3059823364019394,
"epoch": 4.518421052631579,
"frac_reward_zero_std": 0.65625,
"grad_norm": 41.614864349365234,
"learning_rate": 1e-06,
"loss": 0.0064,
"num_tokens": 564988241.0,
"reward": 0.8157514333724976,
"reward_std": 0.08176864683628082,
"rewards/progression_diversity/mean": -0.006893161218613386,
"rewards/progression_diversity/std": 0.0425245501101017,
"rewards/symbolic_reward_accuracy/mean": 0.892578125,
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534,
"rewards/symbolic_reward_partial_score/mean": 0.943359375,
"rewards/symbolic_reward_partial_score/std": 0.18591801822185516,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0574617385864258,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 370.0,
"sampling/sampling_logp_difference/mean": 1.9077733755111694,
"step": 1717
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.2892063856124878,
"epoch": 4.521052631578947,
"grad_norm": 0.005814536940306425,
"learning_rate": 1e-06,
"loss": 0.0526,
"step": 1718
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.29770365357398987,
"epoch": 4.523684210526316,
"grad_norm": 0.021579096093773842,
"learning_rate": 1e-06,
"loss": 0.0965,
"step": 1719
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1171875,
"entropy": 0.30083292722702026,
"epoch": 4.526315789473684,
"grad_norm": 0.016088807955384254,
"learning_rate": 1e-06,
"loss": 0.0295,
"step": 1720
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3407.0,
"completions/mean_length": 1413.296875,
"completions/mean_terminated_length": 899.1515502929688,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"entropy": 0.29098886251449585,
"epoch": 4.528947368421052,
"frac_reward_zero_std": 0.5625,
"grad_norm": 66.23799896240234,
"learning_rate": 1e-06,
"loss": 0.0473,
"num_tokens": 566137193.0,
"reward": 0.8391268253326416,
"reward_std": 0.11196374893188477,
"rewards/progression_diversity/mean": -0.008220801129937172,
"rewards/progression_diversity/std": 0.05032823607325554,
"rewards/symbolic_reward_accuracy/mean": 0.92578125,
"rewards/symbolic_reward_accuracy/std": 0.2623828947544098,
"rewards/symbolic_reward_partial_score/mean": 0.9562174081802368,
"rewards/symbolic_reward_partial_score/std": 0.18327513337135315,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0517854690551758,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 372.0,
"sampling/sampling_logp_difference/mean": 2.562303304672241,
"step": 1721
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.2914315462112427,
"epoch": 4.531578947368421,
"grad_norm": 7.667601585388184,
"learning_rate": 1e-06,
"loss": 0.043,
"step": 1722
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.2990102767944336,
"epoch": 4.534210526315789,
"grad_norm": 0.007618363946676254,
"learning_rate": 1e-06,
"loss": 0.0445,
"step": 1723
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.29170943796634674,
"epoch": 4.536842105263158,
"grad_norm": 0.028170811012387276,
"learning_rate": 1e-06,
"loss": 0.0476,
"step": 1724
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3794.0,
"completions/mean_length": 1380.935546875,
"completions/mean_terminated_length": 834.2651977539062,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"entropy": 0.2995843440294266,
"epoch": 4.5394736842105265,
"frac_reward_zero_std": 0.5625,
"grad_norm": 356.0476379394531,
"learning_rate": 1e-06,
"loss": 0.053,
"num_tokens": 567248200.0,
"reward": 0.8191208243370056,
"reward_std": 0.08875949680805206,
"rewards/progression_diversity/mean": -0.006864731200039387,
"rewards/progression_diversity/std": 0.0402359738945961,
"rewards/symbolic_reward_accuracy/mean": 0.890625,
"rewards/symbolic_reward_accuracy/std": 0.31241437792778015,
"rewards/symbolic_reward_partial_score/mean": 0.9597981572151184,
"rewards/symbolic_reward_partial_score/std": 0.14745070040225983,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0555503368377686,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 372.0,
"sampling/sampling_logp_difference/mean": 2.510345220565796,
"step": 1725
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.30750222504138947,
"epoch": 4.542105263157895,
"grad_norm": 0.007456324994564056,
"learning_rate": 1e-06,
"loss": 0.0425,
"step": 1726
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.29517829418182373,
"epoch": 4.544736842105263,
"grad_norm": 0.017573416233062744,
"learning_rate": 1e-06,
"loss": 0.0846,
"step": 1727
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.3013046234846115,
"epoch": 4.5473684210526315,
"grad_norm": 0.003968199715018272,
"learning_rate": 1e-06,
"loss": 0.0405,
"step": 1728
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3952.0,
"completions/mean_length": 1314.697265625,
"completions/mean_terminated_length": 859.8892822265625,
"completions/min_length": 248.0,
"completions/min_terminated_length": 248.0,
"entropy": 0.2868567556142807,
"epoch": 4.55,
"frac_reward_zero_std": 0.5625,
"grad_norm": 259.5559387207031,
"learning_rate": 1e-06,
"loss": 0.0893,
"num_tokens": 568328301.0,
"reward": 0.8367007374763489,
"reward_std": 0.10205796360969543,
"rewards/progression_diversity/mean": -0.006686838809400797,
"rewards/progression_diversity/std": 0.0429992638528347,
"rewards/symbolic_reward_accuracy/mean": 0.919921875,
"rewards/symbolic_reward_accuracy/std": 0.271679550409317,
"rewards/symbolic_reward_partial_score/mean": 0.9591470956802368,
"rewards/symbolic_reward_partial_score/std": 0.17069299519062042,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.058121681213379,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 372.0,
"sampling/sampling_logp_difference/mean": 2.003477096557617,
"step": 1729
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.29739847779273987,
"epoch": 4.552631578947368,
"grad_norm": 0.022392338141798973,
"learning_rate": 1e-06,
"loss": 0.0507,
"step": 1730
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.30387774109840393,
"epoch": 4.5552631578947365,
"grad_norm": 0.00769463088363409,
"learning_rate": 1e-06,
"loss": 0.0124,
"step": 1731
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.2966042011976242,
"epoch": 4.557894736842105,
"grad_norm": 0.012574239633977413,
"learning_rate": 1e-06,
"loss": 0.0445,
"step": 1732
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3764.0,
"completions/mean_length": 1427.546875,
"completions/mean_terminated_length": 851.1318359375,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"entropy": 0.30412548780441284,
"epoch": 4.560526315789474,
"frac_reward_zero_std": 0.59375,
"grad_norm": 225.99496459960938,
"learning_rate": 1e-06,
"loss": 0.0302,
"num_tokens": 569455493.0,
"reward": 0.826937198638916,
"reward_std": 0.10923168063163757,
"rewards/progression_diversity/mean": -0.006476983428001404,
"rewards/progression_diversity/std": 0.03768179193139076,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9480794072151184,
"rewards/symbolic_reward_partial_score/std": 0.20172148942947388,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0539195537567139,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 372.0,
"sampling/sampling_logp_difference/mean": 2.804877281188965,
"step": 1733
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.2931108772754669,
"epoch": 4.563157894736842,
"grad_norm": 0.008534945547580719,
"learning_rate": 1e-06,
"loss": 0.0567,
"step": 1734
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.27719876170158386,
"epoch": 4.565789473684211,
"grad_norm": 0.015061067417263985,
"learning_rate": 1e-06,
"loss": 0.1137,
"step": 1735
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2918764352798462,
"epoch": 4.568421052631579,
"grad_norm": 0.010875429026782513,
"learning_rate": 1e-06,
"loss": 0.0713,
"step": 1736
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3546.0,
"completions/mean_length": 957.4921875,
"completions/mean_terminated_length": 805.3569946289062,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"entropy": 0.31101664900779724,
"epoch": 4.571052631578947,
"frac_reward_zero_std": 0.6875,
"grad_norm": 0.010922472923994064,
"learning_rate": 1e-06,
"loss": -0.0038,
"num_tokens": 570348193.0,
"reward": 0.875369131565094,
"reward_std": 0.06468972563743591,
"rewards/progression_diversity/mean": -0.0021541740279644728,
"rewards/progression_diversity/std": 0.023635899648070335,
"rewards/symbolic_reward_accuracy/mean": 0.96875,
"rewards/symbolic_reward_accuracy/std": 0.17416280508041382,
"rewards/symbolic_reward_partial_score/mean": 0.9837239980697632,
"rewards/symbolic_reward_partial_score/std": 0.10963507741689682,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0634751319885254,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 372.0,
"sampling/sampling_logp_difference/mean": 1.4045908451080322,
"step": 1737
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.310005784034729,
"epoch": 4.573684210526316,
"grad_norm": 0.012388059869408607,
"learning_rate": 1e-06,
"loss": -0.0014,
"step": 1738
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.30088475346565247,
"epoch": 4.576315789473684,
"grad_norm": 0.002842206507921219,
"learning_rate": 1e-06,
"loss": 0.0399,
"step": 1739
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.3011449873447418,
"epoch": 4.578947368421053,
"grad_norm": 0.03648176044225693,
"learning_rate": 1e-06,
"loss": 0.087,
"step": 1740
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3054.0,
"completions/mean_length": 1486.138671875,
"completions/mean_terminated_length": 911.9817504882812,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"entropy": 0.2893022149801254,
"epoch": 4.581578947368421,
"frac_reward_zero_std": 0.5,
"grad_norm": 198.0384521484375,
"learning_rate": 1e-06,
"loss": 0.0774,
"num_tokens": 571519400.0,
"reward": 0.8131746053695679,
"reward_std": 0.091860830783844,
"rewards/progression_diversity/mean": -0.005787876434624195,
"rewards/progression_diversity/std": 0.035509541630744934,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.95166015625,
"rewards/symbolic_reward_partial_score/std": 0.1652219146490097,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0542500019073486,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 372.0,
"sampling/sampling_logp_difference/mean": 2.839118003845215,
"step": 1741
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.28446468710899353,
"epoch": 4.58421052631579,
"grad_norm": 0.019028333947062492,
"learning_rate": 1e-06,
"loss": 0.1049,
"step": 1742
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.2950809597969055,
"epoch": 4.586842105263158,
"grad_norm": 0.010133459232747555,
"learning_rate": 1e-06,
"loss": 0.0021,
"step": 1743
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.28551898896694183,
"epoch": 4.589473684210526,
"grad_norm": 0.449716717004776,
"learning_rate": 1e-06,
"loss": 0.0884,
"step": 1744
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4694.0,
"completions/mean_length": 1332.86328125,
"completions/mean_terminated_length": 1002.399169921875,
"completions/min_length": 284.0,
"completions/min_terminated_length": 284.0,
"entropy": 0.29140228033065796,
"epoch": 4.592105263157895,
"frac_reward_zero_std": 0.59375,
"grad_norm": 71.57343292236328,
"learning_rate": 1e-06,
"loss": 0.0489,
"num_tokens": 572641506.0,
"reward": 0.8343420624732971,
"reward_std": 0.097034752368927,
"rewards/progression_diversity/mean": -0.0032937643118202686,
"rewards/progression_diversity/std": 0.02761550061404705,
"rewards/symbolic_reward_accuracy/mean": 0.9140625,
"rewards/symbolic_reward_accuracy/std": 0.28054583072662354,
"rewards/symbolic_reward_partial_score/mean": 0.958984375,
"rewards/symbolic_reward_partial_score/std": 0.16060270369052887,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0581992864608765,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 372.0,
"sampling/sampling_logp_difference/mean": 1.9309183359146118,
"step": 1745
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.297273188829422,
"epoch": 4.594736842105263,
"grad_norm": 0.005121263209730387,
"learning_rate": 1e-06,
"loss": 0.0292,
"step": 1746
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.27994534373283386,
"epoch": 4.597368421052631,
"grad_norm": 0.008685395121574402,
"learning_rate": 1e-06,
"loss": 0.0841,
"step": 1747
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.29429104924201965,
"epoch": 4.6,
"grad_norm": 0.004201680421829224,
"learning_rate": 1e-06,
"loss": 0.018,
"step": 1748
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3964.0,
"completions/mean_length": 1609.158203125,
"completions/mean_terminated_length": 1008.5548706054688,
"completions/min_length": 275.0,
"completions/min_terminated_length": 275.0,
"entropy": 0.27888044714927673,
"epoch": 4.602631578947369,
"frac_reward_zero_std": 0.65625,
"grad_norm": 37.15717697143555,
"learning_rate": 1e-06,
"loss": 0.0756,
"num_tokens": 573900403.0,
"reward": 0.8299179673194885,
"reward_std": 0.08020342141389847,
"rewards/progression_diversity/mean": -0.006251877639442682,
"rewards/progression_diversity/std": 0.038109783083200455,
"rewards/symbolic_reward_accuracy/mean": 0.9140625,
"rewards/symbolic_reward_accuracy/std": 0.28054583072662354,
"rewards/symbolic_reward_partial_score/mean": 0.9501953125,
"rewards/symbolic_reward_partial_score/std": 0.183476984500885,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0542418956756592,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 374.0,
"sampling/sampling_logp_difference/mean": 2.3596372604370117,
"step": 1749
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.27801230549812317,
"epoch": 4.605263157894737,
"grad_norm": 213.15374755859375,
"learning_rate": 1e-06,
"loss": 0.1129,
"step": 1750
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2716394513845444,
"epoch": 4.6078947368421055,
"grad_norm": 0.006218337453901768,
"learning_rate": 1e-06,
"loss": 0.1002,
"step": 1751
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.2844415009021759,
"epoch": 4.610526315789474,
"grad_norm": 0.013139521703124046,
"learning_rate": 1e-06,
"loss": 0.0223,
"step": 1752
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3099.0,
"completions/mean_length": 1402.400390625,
"completions/mean_terminated_length": 919.1229858398438,
"completions/min_length": 260.0,
"completions/min_terminated_length": 260.0,
"entropy": 0.2805843651294708,
"epoch": 4.613157894736842,
"frac_reward_zero_std": 0.625,
"grad_norm": 392.7408142089844,
"learning_rate": 1e-06,
"loss": 0.0717,
"num_tokens": 575023424.0,
"reward": 0.8217633366584778,
"reward_std": 0.07674242556095123,
"rewards/progression_diversity/mean": -0.00628986582159996,
"rewards/progression_diversity/std": 0.040007736533880234,
"rewards/symbolic_reward_accuracy/mean": 0.90234375,
"rewards/symbolic_reward_accuracy/std": 0.29713961482048035,
"rewards/symbolic_reward_partial_score/mean": 0.9444986581802368,
"rewards/symbolic_reward_partial_score/std": 0.19026826322078705,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0610547065734863,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 376.0,
"sampling/sampling_logp_difference/mean": 1.5220887660980225,
"step": 1753
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2966681718826294,
"epoch": 4.61578947368421,
"grad_norm": 0.011541535146534443,
"learning_rate": 1e-06,
"loss": 0.0328,
"step": 1754
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2979959547519684,
"epoch": 4.618421052631579,
"grad_norm": 0.007087912876158953,
"learning_rate": 1e-06,
"loss": 0.0277,
"step": 1755
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.29420797526836395,
"epoch": 4.621052631578947,
"grad_norm": 0.012560022063553333,
"learning_rate": 1e-06,
"loss": 0.0552,
"step": 1756
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021484375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4188.0,
"completions/mean_length": 1243.05859375,
"completions/mean_terminated_length": 910.6227416992188,
"completions/min_length": 268.0,
"completions/min_terminated_length": 268.0,
"entropy": 0.3041873723268509,
"epoch": 4.623684210526315,
"frac_reward_zero_std": 0.59375,
"grad_norm": 191.67491149902344,
"learning_rate": 1e-06,
"loss": 0.0741,
"num_tokens": 576051486.0,
"reward": 0.8431257009506226,
"reward_std": 0.0861952155828476,
"rewards/progression_diversity/mean": -0.0038361886981874704,
"rewards/progression_diversity/std": 0.03068256378173828,
"rewards/symbolic_reward_accuracy/mean": 0.92578125,
"rewards/symbolic_reward_accuracy/std": 0.2623828947544098,
"rewards/symbolic_reward_partial_score/mean": 0.9654947519302368,
"rewards/symbolic_reward_partial_score/std": 0.15342977643013,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0610965490341187,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 376.0,
"sampling/sampling_logp_difference/mean": 1.342087984085083,
"step": 1757
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2971126586198807,
"epoch": 4.626315789473685,
"grad_norm": 0.01794944889843464,
"learning_rate": 1e-06,
"loss": 0.0504,
"step": 1758
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.305801659822464,
"epoch": 4.628947368421053,
"grad_norm": 0.012613825500011444,
"learning_rate": 1e-06,
"loss": 0.0083,
"step": 1759
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.30587300658226013,
"epoch": 4.631578947368421,
"grad_norm": 0.022545624524354935,
"learning_rate": 1e-06,
"loss": 0.0683,
"step": 1760
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3825.0,
"completions/mean_length": 1357.8828125,
"completions/mean_terminated_length": 935.4617919921875,
"completions/min_length": 294.0,
"completions/min_terminated_length": 294.0,
"entropy": 0.30523401498794556,
"epoch": 4.63421052631579,
"frac_reward_zero_std": 0.71875,
"grad_norm": 0.04670589789748192,
"learning_rate": 1e-06,
"loss": 0.0298,
"num_tokens": 577145698.0,
"reward": 0.8240532279014587,
"reward_std": 0.07884141802787781,
"rewards/progression_diversity/mean": -0.006792136933654547,
"rewards/progression_diversity/std": 0.047225385904312134,
"rewards/symbolic_reward_accuracy/mean": 0.904296875,
"rewards/symbolic_reward_accuracy/std": 0.2944713830947876,
"rewards/symbolic_reward_partial_score/mean": 0.9475911259651184,
"rewards/symbolic_reward_partial_score/std": 0.18192888796329498,
"rewards/tag_count_reward/mean": -0.02734375,
"rewards/tag_count_reward/std": 0.16324250400066376,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0601003170013428,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 376.0,
"sampling/sampling_logp_difference/mean": 1.485578179359436,
"step": 1761
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.29327723383903503,
"epoch": 4.636842105263158,
"grad_norm": 0.01936684362590313,
"learning_rate": 1e-06,
"loss": 0.0702,
"step": 1762
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1171875,
"entropy": 0.29366083443164825,
"epoch": 4.639473684210526,
"grad_norm": 0.010277163237333298,
"learning_rate": 1e-06,
"loss": 0.0261,
"step": 1763
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0546875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.109375,
"entropy": 0.29086172580718994,
"epoch": 4.6421052631578945,
"grad_norm": 0.008378474041819572,
"learning_rate": 1e-06,
"loss": 0.0463,
"step": 1764
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3414.0,
"completions/mean_length": 1404.08203125,
"completions/mean_terminated_length": 889.6202392578125,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"entropy": 0.30439208447933197,
"epoch": 4.644736842105263,
"frac_reward_zero_std": 0.59375,
"grad_norm": 203.2693328857422,
"learning_rate": 1e-06,
"loss": 0.0463,
"num_tokens": 578260716.0,
"reward": 0.8370853662490845,
"reward_std": 0.09505656361579895,
"rewards/progression_diversity/mean": -0.007285004947334528,
"rewards/progression_diversity/std": 0.046391766518354416,
"rewards/symbolic_reward_accuracy/mean": 0.91796875,
"rewards/symbolic_reward_accuracy/std": 0.2746807038784027,
"rewards/symbolic_reward_partial_score/mean": 0.9656575322151184,
"rewards/symbolic_reward_partial_score/std": 0.1476442664861679,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0571269989013672,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 378.0,
"sampling/sampling_logp_difference/mean": 2.06299090385437,
"step": 1765
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.29458560049533844,
"epoch": 4.647368421052631,
"grad_norm": 0.011282769963145256,
"learning_rate": 1e-06,
"loss": 0.0713,
"step": 1766
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.2905200123786926,
"epoch": 4.65,
"grad_norm": 5.091102600097656,
"learning_rate": 1e-06,
"loss": 0.0594,
"step": 1767
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.2920839935541153,
"epoch": 4.652631578947369,
"grad_norm": 0.00980228092521429,
"learning_rate": 1e-06,
"loss": 0.0531,
"step": 1768
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12152.0,
"completions/mean_length": 1461.2734375,
"completions/mean_terminated_length": 979.8951416015625,
"completions/min_length": 284.0,
"completions/min_terminated_length": 284.0,
"entropy": 0.3031252771615982,
"epoch": 4.655263157894737,
"frac_reward_zero_std": 0.59375,
"grad_norm": 0.020676128566265106,
"learning_rate": 1e-06,
"loss": 0.0366,
"num_tokens": 579421880.0,
"reward": 0.8634036183357239,
"reward_std": 0.1007419154047966,
"rewards/progression_diversity/mean": -0.007299537770450115,
"rewards/progression_diversity/std": 0.04725031182169914,
"rewards/symbolic_reward_accuracy/mean": 0.958984375,
"rewards/symbolic_reward_accuracy/std": 0.19852031767368317,
"rewards/symbolic_reward_partial_score/mean": 0.9700520634651184,
"rewards/symbolic_reward_partial_score/std": 0.16211473941802979,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0525739192962646,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 380.0,
"sampling/sampling_logp_difference/mean": 2.551374912261963,
"step": 1769
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.28818726539611816,
"epoch": 4.657894736842105,
"grad_norm": 0.013333577662706375,
"learning_rate": 1e-06,
"loss": 0.0613,
"step": 1770
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.2797438055276871,
"epoch": 4.660526315789474,
"grad_norm": 12.496295928955078,
"learning_rate": 1e-06,
"loss": 0.089,
"step": 1771
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.29146166145801544,
"epoch": 4.663157894736842,
"grad_norm": 7.178304672241211,
"learning_rate": 1e-06,
"loss": 0.0752,
"step": 1772
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3135.0,
"completions/mean_length": 1521.27734375,
"completions/mean_terminated_length": 1072.7042236328125,
"completions/min_length": 260.0,
"completions/min_terminated_length": 260.0,
"entropy": 0.28964005410671234,
"epoch": 4.66578947368421,
"frac_reward_zero_std": 0.4375,
"grad_norm": 328.8640441894531,
"learning_rate": 1e-06,
"loss": 0.1406,
"num_tokens": 580629446.0,
"reward": 0.8446449041366577,
"reward_std": 0.14509400725364685,
"rewards/progression_diversity/mean": -0.008165711537003517,
"rewards/progression_diversity/std": 0.05180385336279869,
"rewards/symbolic_reward_accuracy/mean": 0.931640625,
"rewards/symbolic_reward_accuracy/std": 0.25260838866233826,
"rewards/symbolic_reward_partial_score/mean": 0.9622395634651184,
"rewards/symbolic_reward_partial_score/std": 0.16547498106956482,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0513108968734741,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 380.0,
"sampling/sampling_logp_difference/mean": 2.6449103355407715,
"step": 1773
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.2889109253883362,
"epoch": 4.668421052631579,
"grad_norm": 0.018843237310647964,
"learning_rate": 1e-06,
"loss": 0.048,
"step": 1774
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.28207381069660187,
"epoch": 4.671052631578947,
"grad_norm": 0.029016956686973572,
"learning_rate": 1e-06,
"loss": 0.0446,
"step": 1775
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.28926801681518555,
"epoch": 4.673684210526316,
"grad_norm": 0.013139498420059681,
"learning_rate": 1e-06,
"loss": 0.0746,
"step": 1776
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3197.0,
"completions/mean_length": 1161.837890625,
"completions/mean_terminated_length": 858.6076049804688,
"completions/min_length": 270.0,
"completions/min_terminated_length": 270.0,
"entropy": 0.2987944483757019,
"epoch": 4.676315789473684,
"frac_reward_zero_std": 0.65625,
"grad_norm": 314.06134033203125,
"learning_rate": 1e-06,
"loss": 0.0431,
"num_tokens": 581621939.0,
"reward": 0.8670340776443481,
"reward_std": 0.08653353154659271,
"rewards/progression_diversity/mean": -0.005582699552178383,
"rewards/progression_diversity/std": 0.042026419192552567,
"rewards/symbolic_reward_accuracy/mean": 0.958984375,
"rewards/symbolic_reward_accuracy/std": 0.19852031767368317,
"rewards/symbolic_reward_partial_score/mean": 0.9788411259651184,
"rewards/symbolic_reward_partial_score/std": 0.12187561392784119,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.05836021900177,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 380.0,
"sampling/sampling_logp_difference/mean": 2.0134835243225098,
"step": 1777
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.312513142824173,
"epoch": 4.678947368421053,
"grad_norm": 0.004544838331639767,
"learning_rate": 1e-06,
"loss": -0.0029,
"step": 1778
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.28765228390693665,
"epoch": 4.681578947368421,
"grad_norm": 0.00618336908519268,
"learning_rate": 1e-06,
"loss": 0.0752,
"step": 1779
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.29780954122543335,
"epoch": 4.684210526315789,
"grad_norm": 0.003846309846267104,
"learning_rate": 1e-06,
"loss": 0.0645,
"step": 1780
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3127.0,
"completions/mean_length": 1343.55078125,
"completions/mean_terminated_length": 858.375,
"completions/min_length": 266.0,
"completions/min_terminated_length": 266.0,
"entropy": 0.29853370785713196,
"epoch": 4.686842105263158,
"frac_reward_zero_std": 0.65625,
"grad_norm": 134.65643310546875,
"learning_rate": 1e-06,
"loss": 0.0479,
"num_tokens": 582688941.0,
"reward": 0.8453360795974731,
"reward_std": 0.08855551481246948,
"rewards/progression_diversity/mean": -0.007415060419589281,
"rewards/progression_diversity/std": 0.04754326492547989,
"rewards/symbolic_reward_accuracy/mean": 0.935546875,
"rewards/symbolic_reward_accuracy/std": 0.24579854309558868,
"rewards/symbolic_reward_partial_score/mean": 0.9573568105697632,
"rewards/symbolic_reward_partial_score/std": 0.1826906055212021,
"rewards/tag_count_reward/mean": -0.03125,
"rewards/tag_count_reward/std": 0.17416280508041382,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0548827648162842,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 380.0,
"sampling/sampling_logp_difference/mean": 2.597440242767334,
"step": 1781
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.30938100814819336,
"epoch": 4.689473684210526,
"grad_norm": 0.007459544111043215,
"learning_rate": 1e-06,
"loss": 0.0288,
"step": 1782
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.2973015457391739,
"epoch": 4.692105263157895,
"grad_norm": 0.013295495882630348,
"learning_rate": 1e-06,
"loss": 0.0913,
"step": 1783
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.29738104343414307,
"epoch": 4.6947368421052635,
"grad_norm": 0.008674710988998413,
"learning_rate": 1e-06,
"loss": 0.1016,
"step": 1784
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048828125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8991.0,
"completions/mean_length": 1652.69921875,
"completions/mean_terminated_length": 896.4722900390625,
"completions/min_length": 231.0,
"completions/min_terminated_length": 231.0,
"entropy": 0.2916813790798187,
"epoch": 4.697368421052632,
"frac_reward_zero_std": 0.53125,
"grad_norm": 311.70086669921875,
"learning_rate": 1e-06,
"loss": 0.1287,
"num_tokens": 583923411.0,
"reward": 0.816964864730835,
"reward_std": 0.0918387770652771,
"rewards/progression_diversity/mean": -0.00762150390073657,
"rewards/progression_diversity/std": 0.04115252196788788,
"rewards/symbolic_reward_accuracy/mean": 0.896484375,
"rewards/symbolic_reward_accuracy/std": 0.30492907762527466,
"rewards/symbolic_reward_partial_score/mean": 0.9454752802848816,
"rewards/symbolic_reward_partial_score/std": 0.191688671708107,
"rewards/tag_count_reward/mean": -0.044921875,
"rewards/tag_count_reward/std": 0.20733514428138733,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0515938997268677,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 380.0,
"sampling/sampling_logp_difference/mean": 2.775969982147217,
"step": 1785
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.2890394926071167,
"epoch": 4.7,
"grad_norm": 0.01550104096531868,
"learning_rate": 1e-06,
"loss": 0.1227,
"step": 1786
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.2989995628595352,
"epoch": 4.7026315789473685,
"grad_norm": 0.008528691716492176,
"learning_rate": 1e-06,
"loss": 0.0246,
"step": 1787
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.29257068037986755,
"epoch": 4.705263157894737,
"grad_norm": 0.010270673781633377,
"learning_rate": 1e-06,
"loss": 0.0584,
"step": 1788
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3506.0,
"completions/mean_length": 1359.19140625,
"completions/mean_terminated_length": 905.726318359375,
"completions/min_length": 279.0,
"completions/min_terminated_length": 279.0,
"entropy": 0.3098142296075821,
"epoch": 4.707894736842105,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.009268060326576233,
"learning_rate": 1e-06,
"loss": -0.0146,
"num_tokens": 585016949.0,
"reward": 0.8493025302886963,
"reward_std": 0.07921440899372101,
"rewards/progression_diversity/mean": -0.006267758086323738,
"rewards/progression_diversity/std": 0.039985544979572296,
"rewards/symbolic_reward_accuracy/mean": 0.9375,
"rewards/symbolic_reward_accuracy/std": 0.2422981858253479,
"rewards/symbolic_reward_partial_score/mean": 0.9646809697151184,
"rewards/symbolic_reward_partial_score/std": 0.15783125162124634,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.058639645576477,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 380.0,
"sampling/sampling_logp_difference/mean": 1.931031346321106,
"step": 1789
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.3012120723724365,
"epoch": 4.7105263157894735,
"grad_norm": 0.018348556011915207,
"learning_rate": 1e-06,
"loss": 0.0325,
"step": 1790
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.3086209297180176,
"epoch": 4.713157894736842,
"grad_norm": 0.03524219989776611,
"learning_rate": 1e-06,
"loss": 0.0427,
"step": 1791
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.29122862219810486,
"epoch": 4.715789473684211,
"grad_norm": 0.004257425665855408,
"learning_rate": 1e-06,
"loss": 0.1065,
"step": 1792
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.044921875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3446.0,
"completions/mean_length": 1595.56640625,
"completions/mean_terminated_length": 899.9959106445312,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 0.287975549697876,
"epoch": 4.718421052631579,
"frac_reward_zero_std": 0.5,
"grad_norm": 392.2756042480469,
"learning_rate": 1e-06,
"loss": 0.1354,
"num_tokens": 586220343.0,
"reward": 0.8267496824264526,
"reward_std": 0.13566027581691742,
"rewards/progression_diversity/mean": -0.010585745796561241,
"rewards/progression_diversity/std": 0.05752396956086159,
"rewards/symbolic_reward_accuracy/mean": 0.912109375,
"rewards/symbolic_reward_accuracy/std": 0.2834126651287079,
"rewards/symbolic_reward_partial_score/mean": 0.9462890625,
"rewards/symbolic_reward_partial_score/std": 0.20108237862586975,
"rewards/tag_count_reward/mean": -0.04296875,
"rewards/tag_count_reward/std": 0.2029850035905838,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.048526644706726,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 380.0,
"sampling/sampling_logp_difference/mean": 3.1703460216522217,
"step": 1793
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.29304662346839905,
"epoch": 4.721052631578948,
"grad_norm": 282.28704833984375,
"learning_rate": 1e-06,
"loss": 0.1636,
"step": 1794
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.29584261775016785,
"epoch": 4.723684210526316,
"grad_norm": 0.01929856278002262,
"learning_rate": 1e-06,
"loss": 0.1043,
"step": 1795
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.30293792486190796,
"epoch": 4.726315789473684,
"grad_norm": 0.007616270799189806,
"learning_rate": 1e-06,
"loss": 0.0534,
"step": 1796
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3592.0,
"completions/mean_length": 1216.298828125,
"completions/mean_terminated_length": 914.1534423828125,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"entropy": 0.2999269813299179,
"epoch": 4.728947368421053,
"frac_reward_zero_std": 0.65625,
"grad_norm": 63.879966735839844,
"learning_rate": 1e-06,
"loss": 0.0632,
"num_tokens": 587246896.0,
"reward": 0.8688350915908813,
"reward_std": 0.0851588174700737,
"rewards/progression_diversity/mean": -0.0061400942504405975,
"rewards/progression_diversity/std": 0.04974029213190079,
"rewards/symbolic_reward_accuracy/mean": 0.962890625,
"rewards/symbolic_reward_accuracy/std": 0.18921469151973724,
"rewards/symbolic_reward_partial_score/mean": 0.97705078125,
"rewards/symbolic_reward_partial_score/std": 0.13179610669612885,
"rewards/tag_count_reward/mean": -0.01953125,
"rewards/tag_count_reward/std": 0.1385180652141571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0599864721298218,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 382.0,
"sampling/sampling_logp_difference/mean": 1.3513113260269165,
"step": 1797
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.30788634717464447,
"epoch": 4.731578947368421,
"grad_norm": 0.03443191573023796,
"learning_rate": 1e-06,
"loss": 0.0174,
"step": 1798
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.2986367344856262,
"epoch": 4.734210526315789,
"grad_norm": 0.0047141476534307,
"learning_rate": 1e-06,
"loss": 0.6458,
"step": 1799
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.29760782420635223,
"epoch": 4.7368421052631575,
"grad_norm": 0.0066524529829621315,
"learning_rate": 1e-06,
"loss": 0.0348,
"step": 1800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3175.0,
"completions/mean_length": 2149.83984375,
"completions/mean_terminated_length": 1008.70458984375,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"entropy": 0.28144165873527527,
"epoch": 4.739473684210527,
"frac_reward_zero_std": 0.5625,
"grad_norm": 226.54739379882812,
"learning_rate": 1e-06,
"loss": 0.083,
"num_tokens": 588753086.0,
"reward": 0.779984712600708,
"reward_std": 0.13797515630722046,
"rewards/progression_diversity/mean": -0.019111623987555504,
"rewards/progression_diversity/std": 0.0777515172958374,
"rewards/symbolic_reward_accuracy/mean": 0.849609375,
"rewards/symbolic_reward_accuracy/std": 0.35780346393585205,
"rewards/symbolic_reward_partial_score/mean": 0.9261067509651184,
"rewards/symbolic_reward_partial_score/std": 0.22488640248775482,
"rewards/tag_count_reward/mean": -0.07421875,
"rewards/tag_count_reward/std": 0.2623828947544098,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0423637628555298,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 384.0,
"sampling/sampling_logp_difference/mean": 3.5376710891723633,
"step": 1801
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.2867640554904938,
"epoch": 4.742105263157895,
"grad_norm": 0.025515422224998474,
"learning_rate": 1e-06,
"loss": 0.1148,
"step": 1802
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.27136462926864624,
"epoch": 4.744736842105263,
"grad_norm": 0.008495149202644825,
"learning_rate": 1e-06,
"loss": 0.1537,
"step": 1803
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2758960574865341,
"epoch": 4.747368421052632,
"grad_norm": 0.33327996730804443,
"learning_rate": 1e-06,
"loss": 0.0768,
"step": 1804
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9181.0,
"completions/mean_length": 1735.49609375,
"completions/mean_terminated_length": 1077.80810546875,
"completions/min_length": 242.0,
"completions/min_terminated_length": 242.0,
"entropy": 0.2854105681180954,
"epoch": 4.75,
"frac_reward_zero_std": 0.375,
"grad_norm": 10.029664039611816,
"learning_rate": 1e-06,
"loss": 0.0718,
"num_tokens": 590060988.0,
"reward": 0.8515157699584961,
"reward_std": 0.12311355024576187,
"rewards/progression_diversity/mean": -0.014438299462199211,
"rewards/progression_diversity/std": 0.07580766081809998,
"rewards/symbolic_reward_accuracy/mean": 0.943359375,
"rewards/symbolic_reward_accuracy/std": 0.23138070106506348,
"rewards/symbolic_reward_partial_score/mean": 0.9671223759651184,
"rewards/symbolic_reward_partial_score/std": 0.15571942925453186,
"rewards/tag_count_reward/mean": -0.044921875,
"rewards/tag_count_reward/std": 0.20733514428138733,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.04230797290802,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 386.0,
"sampling/sampling_logp_difference/mean": 3.432555675506592,
"step": 1805
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.2759237736463547,
"epoch": 4.752631578947368,
"grad_norm": 1959.9281005859375,
"learning_rate": 1e-06,
"loss": 0.2892,
"step": 1806
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.28651775419712067,
"epoch": 4.755263157894737,
"grad_norm": 352.3994140625,
"learning_rate": 1e-06,
"loss": 0.1454,
"step": 1807
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.2970658540725708,
"epoch": 4.757894736842105,
"grad_norm": 0.0121544124558568,
"learning_rate": 1e-06,
"loss": 0.0625,
"step": 1808
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3052.0,
"completions/mean_length": 1934.1640625,
"completions/mean_terminated_length": 938.6638793945312,
"completions/min_length": 279.0,
"completions/min_terminated_length": 279.0,
"entropy": 0.27560150623321533,
"epoch": 4.760526315789473,
"frac_reward_zero_std": 0.40625,
"grad_norm": 242.6947479248047,
"learning_rate": 1e-06,
"loss": 0.1073,
"num_tokens": 591441072.0,
"reward": 0.8055234551429749,
"reward_std": 0.10336482524871826,
"rewards/progression_diversity/mean": -0.01895313523709774,
"rewards/progression_diversity/std": 0.08070053160190582,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.93896484375,
"rewards/symbolic_reward_partial_score/std": 0.19326075911521912,
"rewards/tag_count_reward/mean": -0.056640625,
"rewards/tag_count_reward/std": 0.23138070106506348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0425719022750854,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 388.0,
"sampling/sampling_logp_difference/mean": 3.5746545791625977,
"step": 1809
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.28119949996471405,
"epoch": 4.7631578947368425,
"grad_norm": 0.02790587767958641,
"learning_rate": 1e-06,
"loss": 0.1284,
"step": 1810
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2921789288520813,
"epoch": 4.765789473684211,
"grad_norm": 0.060401126742362976,
"learning_rate": 1e-06,
"loss": 0.0706,
"step": 1811
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.28399544954299927,
"epoch": 4.768421052631579,
"grad_norm": 0.10208897292613983,
"learning_rate": 1e-06,
"loss": 0.0961,
"step": 1812
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3719.0,
"completions/mean_length": 1791.84375,
"completions/mean_terminated_length": 819.0333862304688,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 0.2930610924959183,
"epoch": 4.771052631578947,
"frac_reward_zero_std": 0.5625,
"grad_norm": 260.9751892089844,
"learning_rate": 1e-06,
"loss": 0.0712,
"num_tokens": 592748288.0,
"reward": 0.7997059226036072,
"reward_std": 0.09009853005409241,
"rewards/progression_diversity/mean": -0.014760013669729233,
"rewards/progression_diversity/std": 0.06976296752691269,
"rewards/symbolic_reward_accuracy/mean": 0.876953125,
"rewards/symbolic_reward_accuracy/std": 0.32881227135658264,
"rewards/symbolic_reward_partial_score/mean": 0.9298502206802368,
"rewards/symbolic_reward_partial_score/std": 0.21666119992733002,
"rewards/tag_count_reward/mean": -0.052734375,
"rewards/tag_count_reward/std": 0.22372129559516907,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0533522367477417,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 388.0,
"sampling/sampling_logp_difference/mean": 2.341224431991577,
"step": 1813
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.29311226308345795,
"epoch": 4.773684210526316,
"grad_norm": 0.006627992261201143,
"learning_rate": 1e-06,
"loss": 0.0818,
"step": 1814
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.2873583137989044,
"epoch": 4.776315789473684,
"grad_norm": 0.0039825947023928165,
"learning_rate": 1e-06,
"loss": 0.0784,
"step": 1815
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.29943425953388214,
"epoch": 4.778947368421052,
"grad_norm": 0.006021894048899412,
"learning_rate": 1e-06,
"loss": 0.0369,
"step": 1816
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3120.0,
"completions/mean_length": 1341.46875,
"completions/mean_terminated_length": 949.5791625976562,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"entropy": 0.3028171956539154,
"epoch": 4.781578947368421,
"frac_reward_zero_std": 0.34375,
"grad_norm": 35.22563552856445,
"learning_rate": 1e-06,
"loss": 0.0162,
"num_tokens": 593841616.0,
"reward": 0.8557599782943726,
"reward_std": 0.1146036684513092,
"rewards/progression_diversity/mean": -0.009937912225723267,
"rewards/progression_diversity/std": 0.0624278225004673,
"rewards/symbolic_reward_accuracy/mean": 0.943359375,
"rewards/symbolic_reward_accuracy/std": 0.23138070106506348,
"rewards/symbolic_reward_partial_score/mean": 0.974609375,
"rewards/symbolic_reward_partial_score/std": 0.12061332911252975,
"rewards/tag_count_reward/mean": -0.025390625,
"rewards/tag_count_reward/std": 0.15746226906776428,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.052259922027588,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 390.0,
"sampling/sampling_logp_difference/mean": 2.46126127243042,
"step": 1817
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.2946004569530487,
"epoch": 4.784210526315789,
"grad_norm": 524.8899536132812,
"learning_rate": 1e-06,
"loss": 0.1229,
"step": 1818
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.29525619745254517,
"epoch": 4.786842105263158,
"grad_norm": 0.0062756622210145,
"learning_rate": 1e-06,
"loss": 0.0398,
"step": 1819
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.265625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2876027226448059,
"epoch": 4.7894736842105265,
"grad_norm": 0.011834767647087574,
"learning_rate": 1e-06,
"loss": 0.072,
"step": 1820
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8261.0,
"completions/mean_length": 1432.533203125,
"completions/mean_terminated_length": 950.227783203125,
"completions/min_length": 243.0,
"completions/min_terminated_length": 243.0,
"entropy": 0.294967457652092,
"epoch": 4.792105263157895,
"frac_reward_zero_std": 0.59375,
"grad_norm": 206.98422241210938,
"learning_rate": 1e-06,
"loss": 0.0693,
"num_tokens": 594979361.0,
"reward": 0.8551140427589417,
"reward_std": 0.08930052816867828,
"rewards/progression_diversity/mean": -0.011060354299843311,
"rewards/progression_diversity/std": 0.0651150494813919,
"rewards/symbolic_reward_accuracy/mean": 0.9453125,
"rewards/symbolic_reward_accuracy/std": 0.2275916188955307,
"rewards/symbolic_reward_partial_score/mean": 0.97119140625,
"rewards/symbolic_reward_partial_score/std": 0.14763346314430237,
"rewards/tag_count_reward/mean": -0.033203125,
"rewards/tag_count_reward/std": 0.17934183776378632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0505045652389526,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 392.0,
"sampling/sampling_logp_difference/mean": 2.7975821495056152,
"step": 1821
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.29428502917289734,
"epoch": 4.794736842105263,
"grad_norm": 0.00786570180207491,
"learning_rate": 1e-06,
"loss": 0.0222,
"step": 1822
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.2888794392347336,
"epoch": 4.7973684210526315,
"grad_norm": 0.015527124516665936,
"learning_rate": 1e-06,
"loss": 0.0965,
"step": 1823
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2929443120956421,
"epoch": 4.8,
"grad_norm": 0.0051057226955890656,
"learning_rate": 1e-06,
"loss": 0.0823,
"step": 1824
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5839.0,
"completions/mean_length": 1806.5625,
"completions/mean_terminated_length": 963.2396240234375,
"completions/min_length": 284.0,
"completions/min_terminated_length": 284.0,
"entropy": 0.2864207327365875,
"epoch": 4.802631578947368,
"frac_reward_zero_std": 0.59375,
"grad_norm": 191.23019409179688,
"learning_rate": 1e-06,
"loss": 0.0386,
"num_tokens": 596299105.0,
"reward": 0.7930800914764404,
"reward_std": 0.09042888879776001,
"rewards/progression_diversity/mean": -0.01328323408961296,
"rewards/progression_diversity/std": 0.06475852429866791,
"rewards/symbolic_reward_accuracy/mean": 0.865234375,
"rewards/symbolic_reward_accuracy/std": 0.3418070077896118,
"rewards/symbolic_reward_partial_score/mean": 0.9265950322151184,
"rewards/symbolic_reward_partial_score/std": 0.22019346058368683,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0515691041946411,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 392.0,
"sampling/sampling_logp_difference/mean": 2.668024778366089,
"step": 1825
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.26454444229602814,
"epoch": 4.8052631578947365,
"grad_norm": 0.004221981856971979,
"learning_rate": 1e-06,
"loss": 0.1982,
"step": 1826
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.2992611974477768,
"epoch": 4.807894736842105,
"grad_norm": 0.009702647104859352,
"learning_rate": 1e-06,
"loss": 0.0082,
"step": 1827
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.28555387258529663,
"epoch": 4.810526315789474,
"grad_norm": 0.007640550844371319,
"learning_rate": 1e-06,
"loss": 0.0761,
"step": 1828
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 2638.0,
"completions/mean_length": 1518.830078125,
"completions/mean_terminated_length": 914.5548706054688,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"entropy": 0.2895033657550812,
"epoch": 4.813157894736842,
"frac_reward_zero_std": 0.4375,
"grad_norm": 482.5952453613281,
"learning_rate": 1e-06,
"loss": 0.0888,
"num_tokens": 597485546.0,
"reward": 0.8472241163253784,
"reward_std": 0.12179014086723328,
"rewards/progression_diversity/mean": -0.013924511149525642,
"rewards/progression_diversity/std": 0.07416489720344543,
"rewards/symbolic_reward_accuracy/mean": 0.93359375,
"rewards/symbolic_reward_accuracy/std": 0.2492343932390213,
"rewards/symbolic_reward_partial_score/mean": 0.9703775644302368,
"rewards/symbolic_reward_partial_score/std": 0.1422656625509262,
"rewards/tag_count_reward/mean": -0.0390625,
"rewards/tag_count_reward/std": 0.1939331740140915,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0476664304733276,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 392.0,
"sampling/sampling_logp_difference/mean": 3.4861273765563965,
"step": 1829
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.30734431743621826,
"epoch": 4.815789473684211,
"grad_norm": 0.01676630787551403,
"learning_rate": 1e-06,
"loss": 0.0315,
"step": 1830
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.2916211038827896,
"epoch": 4.818421052631579,
"grad_norm": 0.01112330798059702,
"learning_rate": 1e-06,
"loss": 0.0759,
"step": 1831
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.2971077710390091,
"epoch": 4.821052631578947,
"grad_norm": 0.006531843915581703,
"learning_rate": 1e-06,
"loss": 0.0779,
"step": 1832
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.041015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3569.0,
"completions/mean_length": 1641.30859375,
"completions/mean_terminated_length": 1010.7658081054688,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.2911697328090668,
"epoch": 4.823684210526316,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.01622670330107212,
"learning_rate": 1e-06,
"loss": 0.0771,
"num_tokens": 598745384.0,
"reward": 0.8167889714241028,
"reward_std": 0.10516804456710815,
"rewards/progression_diversity/mean": -0.010561157017946243,
"rewards/progression_diversity/std": 0.05700741335749626,
"rewards/symbolic_reward_accuracy/mean": 0.8984375,
"rewards/symbolic_reward_accuracy/std": 0.30236753821372986,
"rewards/symbolic_reward_partial_score/mean": 0.9378255605697632,
"rewards/symbolic_reward_partial_score/std": 0.2107086479663849,
"rewards/tag_count_reward/mean": -0.03515625,
"rewards/tag_count_reward/std": 0.1843547374010086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.045763373374939,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 394.0,
"sampling/sampling_logp_difference/mean": 3.4746012687683105,
"step": 1833
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.28375144302845,
"epoch": 4.826315789473684,
"grad_norm": 0.007259514648467302,
"learning_rate": 1e-06,
"loss": 0.0821,
"step": 1834
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.28915947675704956,
"epoch": 4.828947368421053,
"grad_norm": 0.020610112696886063,
"learning_rate": 1e-06,
"loss": 0.0623,
"step": 1835
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.2865881621837616,
"epoch": 4.831578947368421,
"grad_norm": 0.014592897146940231,
"learning_rate": 1e-06,
"loss": 0.1088,
"step": 1836
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.041015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3561.0,
"completions/mean_length": 1590.58984375,
"completions/mean_terminated_length": 957.8778686523438,
"completions/min_length": 260.0,
"completions/min_terminated_length": 260.0,
"entropy": 0.29244448244571686,
"epoch": 4.83421052631579,
"frac_reward_zero_std": 0.5,
"grad_norm": 432.8992919921875,
"learning_rate": 1e-06,
"loss": 0.042,
"num_tokens": 599961590.0,
"reward": 0.8015085458755493,
"reward_std": 0.11123409122228622,
"rewards/progression_diversity/mean": -0.010282262228429317,
"rewards/progression_diversity/std": 0.05530929192900658,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.9435220956802368,
"rewards/symbolic_reward_partial_score/std": 0.17923220992088318,
"rewards/tag_count_reward/mean": -0.041015625,
"rewards/tag_count_reward/std": 0.19852031767368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0479365587234497,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 396.0,
"sampling/sampling_logp_difference/mean": 3.16892409324646,
"step": 1837
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.28493815660476685,
"epoch": 4.836842105263158,
"grad_norm": 0.014859353192150593,
"learning_rate": 1e-06,
"loss": 0.0579,
"step": 1838
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.29050807654857635,
"epoch": 4.839473684210526,
"grad_norm": 0.00957922451198101,
"learning_rate": 1e-06,
"loss": 0.0393,
"step": 1839
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.28927716612815857,
"epoch": 4.842105263157895,
"grad_norm": 0.018363086506724358,
"learning_rate": 1e-06,
"loss": 0.0646,
"step": 1840
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4263.0,
"completions/mean_length": 961.29296875,
"completions/mean_terminated_length": 839.8543090820312,
"completions/min_length": 251.0,
"completions/min_terminated_length": 251.0,
"entropy": 0.30787861347198486,
"epoch": 4.844736842105263,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.09452533721923828,
"learning_rate": 1e-06,
"loss": 0.0309,
"num_tokens": 600826380.0,
"reward": 0.8753113746643066,
"reward_std": 0.08123890310525894,
"rewards/progression_diversity/mean": -0.0030442136339843273,
"rewards/progression_diversity/std": 0.038930658251047134,
"rewards/symbolic_reward_accuracy/mean": 0.96875,
"rewards/symbolic_reward_accuracy/std": 0.17416280508041382,
"rewards/symbolic_reward_partial_score/mean": 0.98291015625,
"rewards/symbolic_reward_partial_score/std": 0.1046241894364357,
"rewards/tag_count_reward/mean": -0.0078125,
"rewards/tag_count_reward/std": 0.08812850713729858,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.063698410987854,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 394.0,
"sampling/sampling_logp_difference/mean": 1.2699187994003296,
"step": 1841
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.31785525381565094,
"epoch": 4.847368421052631,
"grad_norm": 36.97111511230469,
"learning_rate": 1e-06,
"loss": 0.0007,
"step": 1842
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.3095642328262329,
"epoch": 4.85,
"grad_norm": 0.019375812262296677,
"learning_rate": 1e-06,
"loss": 0.0321,
"step": 1843
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.3243100643157959,
"epoch": 4.852631578947369,
"grad_norm": 0.00351691129617393,
"learning_rate": 1e-06,
"loss": -0.0018,
"step": 1844
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.060546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4850.0,
"completions/mean_length": 1875.322265625,
"completions/mean_terminated_length": 940.2515869140625,
"completions/min_length": 281.0,
"completions/min_terminated_length": 281.0,
"entropy": 0.2960711419582367,
"epoch": 4.855263157894737,
"frac_reward_zero_std": 0.5,
"grad_norm": 743.4161376953125,
"learning_rate": 1e-06,
"loss": 0.0685,
"num_tokens": 602193201.0,
"reward": 0.8039959669113159,
"reward_std": 0.11662513762712479,
"rewards/progression_diversity/mean": -0.015446132980287075,
"rewards/progression_diversity/std": 0.07263787090778351,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.93310546875,
"rewards/symbolic_reward_partial_score/std": 0.21881107985973358,
"rewards/tag_count_reward/mean": -0.0546875,
"rewards/tag_count_reward/std": 0.2275916188955307,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0418115854263306,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 396.0,
"sampling/sampling_logp_difference/mean": 4.129971981048584,
"step": 1845
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.28108513355255127,
"epoch": 4.8578947368421055,
"grad_norm": 685.2288818359375,
"learning_rate": 1e-06,
"loss": 0.2488,
"step": 1846
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.28235989809036255,
"epoch": 4.860526315789474,
"grad_norm": 0.029184581711888313,
"learning_rate": 1e-06,
"loss": 0.1114,
"step": 1847
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.28090812265872955,
"epoch": 4.863157894736842,
"grad_norm": 0.2511216998100281,
"learning_rate": 1e-06,
"loss": 0.0608,
"step": 1848
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.095703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3051.0,
"completions/mean_length": 2357.298828125,
"completions/mean_terminated_length": 872.8314819335938,
"completions/min_length": 270.0,
"completions/min_terminated_length": 270.0,
"entropy": 0.26737289130687714,
"epoch": 4.86578947368421,
"frac_reward_zero_std": 0.3125,
"grad_norm": 318.45111083984375,
"learning_rate": 1e-06,
"loss": 0.1068,
"num_tokens": 603795082.0,
"reward": 0.7866251468658447,
"reward_std": 0.13819053769111633,
"rewards/progression_diversity/mean": -0.028891239315271378,
"rewards/progression_diversity/std": 0.10002173483371735,
"rewards/symbolic_reward_accuracy/mean": 0.869140625,
"rewards/symbolic_reward_accuracy/std": 0.33757632970809937,
"rewards/symbolic_reward_partial_score/mean": 0.91015625,
"rewards/symbolic_reward_partial_score/std": 0.25601255893707275,
"rewards/tag_count_reward/mean": -0.076171875,
"rewards/tag_count_reward/std": 0.26553234457969666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0239213705062866,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 396.0,
"sampling/sampling_logp_difference/mean": 6.760751247406006,
"step": 1849
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.2813594937324524,
"epoch": 4.868421052631579,
"grad_norm": 371.5801086425781,
"learning_rate": 1e-06,
"loss": 0.1545,
"step": 1850
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.25378528982400894,
"epoch": 4.871052631578947,
"grad_norm": 1.993329405784607,
"learning_rate": 1e-06,
"loss": 0.2296,
"step": 1851
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2722862958908081,
"epoch": 4.873684210526315,
"grad_norm": 0.014913872815668583,
"learning_rate": 1e-06,
"loss": 0.1323,
"step": 1852
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.119140625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3017.0,
"completions/mean_length": 2810.728515625,
"completions/mean_terminated_length": 974.8757934570312,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"entropy": 0.2696688771247864,
"epoch": 4.876315789473685,
"frac_reward_zero_std": 0.34375,
"grad_norm": 104.42438507080078,
"learning_rate": 1e-06,
"loss": 0.0423,
"num_tokens": 605653183.0,
"reward": 0.7709691524505615,
"reward_std": 0.15861886739730835,
"rewards/progression_diversity/mean": -0.03199107199907303,
"rewards/progression_diversity/std": 0.09810096770524979,
"rewards/symbolic_reward_accuracy/mean": 0.85546875,
"rewards/symbolic_reward_accuracy/std": 0.35197147727012634,
"rewards/symbolic_reward_partial_score/mean": 0.8893228769302368,
"rewards/symbolic_reward_partial_score/std": 0.2945672869682312,
"rewards/tag_count_reward/mean": -0.087890625,
"rewards/tag_count_reward/std": 0.2834126651287079,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0273919105529785,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 400.0,
"sampling/sampling_logp_difference/mean": 5.841981887817383,
"step": 1853
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.25676195323467255,
"epoch": 4.878947368421053,
"grad_norm": 27.663148880004883,
"learning_rate": 1e-06,
"loss": 0.1787,
"step": 1854
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1796875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.23987554013729095,
"epoch": 4.881578947368421,
"grad_norm": 0.01363349985331297,
"learning_rate": 1e-06,
"loss": 0.2546,
"step": 1855
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.26036641001701355,
"epoch": 4.88421052631579,
"grad_norm": 0.026609908789396286,
"learning_rate": 1e-06,
"loss": 0.0951,
"step": 1856
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.091796875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3775.0,
"completions/mean_length": 2387.767578125,
"completions/mean_terminated_length": 973.0946655273438,
"completions/min_length": 302.0,
"completions/min_terminated_length": 302.0,
"entropy": 0.2557432949542999,
"epoch": 4.886842105263158,
"frac_reward_zero_std": 0.34375,
"grad_norm": 442.3005065917969,
"learning_rate": 1e-06,
"loss": 0.1959,
"num_tokens": 607298216.0,
"reward": 0.7776139974594116,
"reward_std": 0.15359753370285034,
"rewards/progression_diversity/mean": -0.03156745806336403,
"rewards/progression_diversity/std": 0.10606934875249863,
"rewards/symbolic_reward_accuracy/mean": 0.859375,
"rewards/symbolic_reward_accuracy/std": 0.3479743003845215,
"rewards/symbolic_reward_partial_score/mean": 0.8958333730697632,
"rewards/symbolic_reward_partial_score/std": 0.2811386287212372,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0301284790039062,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 400.0,
"sampling/sampling_logp_difference/mean": 5.681142807006836,
"step": 1857
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2727195918560028,
"epoch": 4.889473684210526,
"grad_norm": 266.2679443359375,
"learning_rate": 1e-06,
"loss": 0.1627,
"step": 1858
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.2670164704322815,
"epoch": 4.8921052631578945,
"grad_norm": 0.010746384039521217,
"learning_rate": 1e-06,
"loss": 0.1468,
"step": 1859
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.27139565348625183,
"epoch": 4.894736842105263,
"grad_norm": 0.0044360412284731865,
"learning_rate": 1e-06,
"loss": 0.1136,
"step": 1860
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3370.0,
"completions/mean_length": 1420.21484375,
"completions/mean_terminated_length": 811.9308471679688,
"completions/min_length": 279.0,
"completions/min_terminated_length": 279.0,
"entropy": 0.2882204055786133,
"epoch": 4.897368421052631,
"frac_reward_zero_std": 0.5,
"grad_norm": 773.8041381835938,
"learning_rate": 1e-06,
"loss": 0.1389,
"num_tokens": 608393302.0,
"reward": 0.8496336936950684,
"reward_std": 0.10768449306488037,
"rewards/progression_diversity/mean": -0.017098724842071533,
"rewards/progression_diversity/std": 0.08937934786081314,
"rewards/symbolic_reward_accuracy/mean": 0.939453125,
"rewards/symbolic_reward_accuracy/std": 0.2387305200099945,
"rewards/symbolic_reward_partial_score/mean": 0.9635416269302368,
"rewards/symbolic_reward_partial_score/std": 0.16535688936710358,
"rewards/tag_count_reward/mean": -0.029296875,
"rewards/tag_count_reward/std": 0.16880230605602264,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0443835258483887,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 402.0,
"sampling/sampling_logp_difference/mean": 3.879685878753662,
"step": 1861
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.30971573293209076,
"epoch": 4.9,
"grad_norm": 0.017872009426355362,
"learning_rate": 1e-06,
"loss": 0.0584,
"step": 1862
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.30385591089725494,
"epoch": 4.902631578947369,
"grad_norm": 0.0137720238417387,
"learning_rate": 1e-06,
"loss": 0.0668,
"step": 1863
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.28459571301937103,
"epoch": 4.905263157894737,
"grad_norm": 0.005599349737167358,
"learning_rate": 1e-06,
"loss": 0.138,
"step": 1864
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.076171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3664.0,
"completions/mean_length": 2037.166015625,
"completions/mean_terminated_length": 854.2346801757812,
"completions/min_length": 281.0,
"completions/min_terminated_length": 281.0,
"entropy": 0.29497842490673065,
"epoch": 4.907894736842105,
"frac_reward_zero_std": 0.46875,
"grad_norm": 3.637153148651123,
"learning_rate": 1e-06,
"loss": 0.0049,
"num_tokens": 609837131.0,
"reward": 0.8191713094711304,
"reward_std": 0.12532027065753937,
"rewards/progression_diversity/mean": -0.03111710585653782,
"rewards/progression_diversity/std": 0.11476296186447144,
"rewards/symbolic_reward_accuracy/mean": 0.91015625,
"rewards/symbolic_reward_accuracy/std": 0.2862374484539032,
"rewards/symbolic_reward_partial_score/mean": 0.9334309697151184,
"rewards/symbolic_reward_partial_score/std": 0.229754239320755,
"rewards/tag_count_reward/mean": -0.06640625,
"rewards/tag_count_reward/std": 0.2492343932390213,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0394989252090454,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 404.0,
"sampling/sampling_logp_difference/mean": 4.205488204956055,
"step": 1865
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.28409726917743683,
"epoch": 4.910526315789474,
"grad_norm": 175.42616271972656,
"learning_rate": 1e-06,
"loss": 0.1474,
"step": 1866
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2551625221967697,
"epoch": 4.913157894736842,
"grad_norm": 11.765541076660156,
"learning_rate": 1e-06,
"loss": 0.2086,
"step": 1867
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.2779727131128311,
"epoch": 4.91578947368421,
"grad_norm": 0.00957444030791521,
"learning_rate": 1e-06,
"loss": 0.1011,
"step": 1868
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3578.0,
"completions/mean_length": 2560.607421875,
"completions/mean_terminated_length": 1031.33837890625,
"completions/min_length": 282.0,
"completions/min_terminated_length": 282.0,
"entropy": 0.26823192834854126,
"epoch": 4.918421052631579,
"frac_reward_zero_std": 0.28125,
"grad_norm": 1057.808837890625,
"learning_rate": 1e-06,
"loss": 0.1326,
"num_tokens": 611581346.0,
"reward": 0.7715161442756653,
"reward_std": 0.2120749056339264,
"rewards/progression_diversity/mean": -0.04077008739113808,
"rewards/progression_diversity/std": 0.13221383094787598,
"rewards/symbolic_reward_accuracy/mean": 0.8515625,
"rewards/symbolic_reward_accuracy/std": 0.35588082671165466,
"rewards/symbolic_reward_partial_score/mean": 0.8992512822151184,
"rewards/symbolic_reward_partial_score/std": 0.27642568945884705,
"rewards/tag_count_reward/mean": -0.087890625,
"rewards/tag_count_reward/std": 0.2834126651287079,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0262513160705566,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 404.0,
"sampling/sampling_logp_difference/mean": 5.402329444885254,
"step": 1869
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.27585209906101227,
"epoch": 4.921052631578947,
"grad_norm": 338.9614562988281,
"learning_rate": 1e-06,
"loss": 0.1617,
"step": 1870
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.26490674912929535,
"epoch": 4.923684210526316,
"grad_norm": 0.07172678411006927,
"learning_rate": 1e-06,
"loss": 0.1314,
"step": 1871
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.2522326856851578,
"epoch": 4.926315789473684,
"grad_norm": 0.006790068931877613,
"learning_rate": 1e-06,
"loss": 0.2023,
"step": 1872
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3835.0,
"completions/mean_length": 1870.408203125,
"completions/mean_terminated_length": 870.5156860351562,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"entropy": 0.29680676758289337,
"epoch": 4.928947368421053,
"frac_reward_zero_std": 0.4375,
"grad_norm": 459.56561279296875,
"learning_rate": 1e-06,
"loss": 0.0501,
"num_tokens": 612917779.0,
"reward": 0.8070130944252014,
"reward_std": 0.12357471138238907,
"rewards/progression_diversity/mean": -0.02623012289404869,
"rewards/progression_diversity/std": 0.10527035593986511,
"rewards/symbolic_reward_accuracy/mean": 0.892578125,
"rewards/symbolic_reward_accuracy/std": 0.30995169281959534,
"rewards/symbolic_reward_partial_score/mean": 0.9239909052848816,
"rewards/symbolic_reward_partial_score/std": 0.24311135709285736,
"rewards/tag_count_reward/mean": -0.0546875,
"rewards/tag_count_reward/std": 0.2275916188955307,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0434627532958984,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 408.0,
"sampling/sampling_logp_difference/mean": 3.528529167175293,
"step": 1873
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.296875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.28874650597572327,
"epoch": 4.931578947368421,
"grad_norm": 229.14480590820312,
"learning_rate": 1e-06,
"loss": 0.1112,
"step": 1874
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.28308890759944916,
"epoch": 4.934210526315789,
"grad_norm": 0.023203283548355103,
"learning_rate": 1e-06,
"loss": 0.1112,
"step": 1875
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.2699621021747589,
"epoch": 4.936842105263158,
"grad_norm": 0.0074362908490002155,
"learning_rate": 1e-06,
"loss": 0.1392,
"step": 1876
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.068359375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3640.0,
"completions/mean_length": 2017.177734375,
"completions/mean_terminated_length": 963.0083618164062,
"completions/min_length": 336.0,
"completions/min_terminated_length": 336.0,
"entropy": 0.2770151048898697,
"epoch": 4.939473684210526,
"frac_reward_zero_std": 0.4375,
"grad_norm": 661.4608154296875,
"learning_rate": 1e-06,
"loss": 0.1113,
"num_tokens": 614368398.0,
"reward": 0.8179945945739746,
"reward_std": 0.1273220181465149,
"rewards/progression_diversity/mean": -0.03159845247864723,
"rewards/progression_diversity/std": 0.12263655662536621,
"rewards/symbolic_reward_accuracy/mean": 0.90234375,
"rewards/symbolic_reward_accuracy/std": 0.29713961482048035,
"rewards/symbolic_reward_partial_score/mean": 0.9444986581802368,
"rewards/symbolic_reward_partial_score/std": 0.2035871148109436,
"rewards/tag_count_reward/mean": -0.064453125,
"rewards/tag_count_reward/std": 0.24579854309558868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0411176681518555,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 408.0,
"sampling/sampling_logp_difference/mean": 3.9134116172790527,
"step": 1877
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.2913369834423065,
"epoch": 4.942105263157895,
"grad_norm": 0.01320668775588274,
"learning_rate": 1e-06,
"loss": 0.0404,
"step": 1878
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.28409533202648163,
"epoch": 4.9447368421052635,
"grad_norm": 0.03402348980307579,
"learning_rate": 1e-06,
"loss": 0.1155,
"step": 1879
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2711239904165268,
"epoch": 4.947368421052632,
"grad_norm": 0.008342397399246693,
"learning_rate": 1e-06,
"loss": 0.1331,
"step": 1880
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.056640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3740.0,
"completions/mean_length": 1834.041015625,
"completions/mean_terminated_length": 960.4410400390625,
"completions/min_length": 289.0,
"completions/min_terminated_length": 289.0,
"entropy": 0.2792969048023224,
"epoch": 4.95,
"frac_reward_zero_std": 0.3125,
"grad_norm": 335.7244873046875,
"learning_rate": 1e-06,
"loss": 0.125,
"num_tokens": 615720259.0,
"reward": 0.842618465423584,
"reward_std": 0.1381361484527588,
"rewards/progression_diversity/mean": -0.02526368759572506,
"rewards/progression_diversity/std": 0.10902338474988937,
"rewards/symbolic_reward_accuracy/mean": 0.93359375,
"rewards/symbolic_reward_accuracy/std": 0.2492343932390213,
"rewards/symbolic_reward_partial_score/mean": 0.9593098759651184,
"rewards/symbolic_reward_partial_score/std": 0.18112175166606903,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0347847938537598,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 410.0,
"sampling/sampling_logp_difference/mean": 4.509306907653809,
"step": 1881
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.2655886858701706,
"epoch": 4.9526315789473685,
"grad_norm": 0.01917690970003605,
"learning_rate": 1e-06,
"loss": 0.1565,
"step": 1882
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2937444746494293,
"epoch": 4.955263157894737,
"grad_norm": 0.027145760133862495,
"learning_rate": 1e-06,
"loss": 0.0847,
"step": 1883
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.2872210890054703,
"epoch": 4.957894736842105,
"grad_norm": 0.007199451327323914,
"learning_rate": 1e-06,
"loss": 0.1322,
"step": 1884
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.076171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4932.0,
"completions/mean_length": 2128.708984375,
"completions/mean_terminated_length": 953.3255615234375,
"completions/min_length": 268.0,
"completions/min_terminated_length": 268.0,
"entropy": 0.2776035964488983,
"epoch": 4.9605263157894735,
"frac_reward_zero_std": 0.46875,
"grad_norm": 354.8702392578125,
"learning_rate": 1e-06,
"loss": 0.0456,
"num_tokens": 617215790.0,
"reward": 0.7808091640472412,
"reward_std": 0.14809326827526093,
"rewards/progression_diversity/mean": -0.029436133801937103,
"rewards/progression_diversity/std": 0.1100505068898201,
"rewards/symbolic_reward_accuracy/mean": 0.861328125,
"rewards/symbolic_reward_accuracy/std": 0.34594178199768066,
"rewards/symbolic_reward_partial_score/mean": 0.9044596552848816,
"rewards/symbolic_reward_partial_score/std": 0.26103639602661133,
"rewards/tag_count_reward/mean": -0.0703125,
"rewards/tag_count_reward/std": 0.25592297315597534,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.038970708847046,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 410.0,
"sampling/sampling_logp_difference/mean": 4.3476243019104,
"step": 1885
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.2802853584289551,
"epoch": 4.963157894736842,
"grad_norm": 0.026210030540823936,
"learning_rate": 1e-06,
"loss": 0.1114,
"step": 1886
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.292650431394577,
"epoch": 4.965789473684211,
"grad_norm": 0.017857909202575684,
"learning_rate": 1e-06,
"loss": 0.0637,
"step": 1887
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.258012056350708,
"epoch": 4.968421052631579,
"grad_norm": 0.031659141182899475,
"learning_rate": 1e-06,
"loss": 0.1985,
"step": 1888
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 4051.0,
"completions/mean_length": 1877.359375,
"completions/mean_terminated_length": 1069.773193359375,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.28717607259750366,
"epoch": 4.971052631578948,
"frac_reward_zero_std": 0.34375,
"grad_norm": 143.55223083496094,
"learning_rate": 1e-06,
"loss": 0.0625,
"num_tokens": 618611846.0,
"reward": 0.8242976665496826,
"reward_std": 0.13553351163864136,
"rewards/progression_diversity/mean": -0.021410608664155006,
"rewards/progression_diversity/std": 0.09337057173252106,
"rewards/symbolic_reward_accuracy/mean": 0.90625,
"rewards/symbolic_reward_accuracy/std": 0.29176566004753113,
"rewards/symbolic_reward_partial_score/mean": 0.9527994990348816,
"rewards/symbolic_reward_partial_score/std": 0.17747651040554047,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0396955013275146,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 412.0,
"sampling/sampling_logp_difference/mean": 4.124094009399414,
"step": 1889
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.2870943546295166,
"epoch": 4.973684210526316,
"grad_norm": 0.014890086837112904,
"learning_rate": 1e-06,
"loss": 0.0991,
"step": 1890
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.27629394829273224,
"epoch": 4.976315789473684,
"grad_norm": 0.010494343005120754,
"learning_rate": 1e-06,
"loss": 0.0717,
"step": 1891
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.26271922141313553,
"epoch": 4.978947368421053,
"grad_norm": 0.014826311729848385,
"learning_rate": 1e-06,
"loss": 0.1739,
"step": 1892
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064453125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3538.0,
"completions/mean_length": 1878.482421875,
"completions/mean_terminated_length": 879.1461791992188,
"completions/min_length": 249.0,
"completions/min_terminated_length": 249.0,
"entropy": 0.30142028629779816,
"epoch": 4.981578947368421,
"frac_reward_zero_std": 0.4375,
"grad_norm": 139.75131225585938,
"learning_rate": 1e-06,
"loss": 0.0789,
"num_tokens": 619963101.0,
"reward": 0.8057065010070801,
"reward_std": 0.13081881403923035,
"rewards/progression_diversity/mean": -0.025057487189769745,
"rewards/progression_diversity/std": 0.10004720836877823,
"rewards/symbolic_reward_accuracy/mean": 0.88671875,
"rewards/symbolic_reward_accuracy/std": 0.3172462284564972,
"rewards/symbolic_reward_partial_score/mean": 0.9339193105697632,
"rewards/symbolic_reward_partial_score/std": 0.21596600115299225,
"rewards/tag_count_reward/mean": -0.0625,
"rewards/tag_count_reward/std": 0.2422981858253479,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0438662767410278,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 412.0,
"sampling/sampling_logp_difference/mean": 4.115793228149414,
"step": 1893
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.28368599712848663,
"epoch": 4.984210526315789,
"grad_norm": 0.008541757240891457,
"learning_rate": 1e-06,
"loss": 0.1416,
"step": 1894
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.2943654954433441,
"epoch": 4.9868421052631575,
"grad_norm": 0.007597525604069233,
"learning_rate": 1e-06,
"loss": 0.0159,
"step": 1895
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.28492121398448944,
"epoch": 4.989473684210527,
"grad_norm": 0.008518518880009651,
"learning_rate": 1e-06,
"loss": 0.0887,
"step": 1896
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3774.0,
"completions/mean_length": 1736.919921875,
"completions/mean_terminated_length": 889.568115234375,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 0.30898527801036835,
"epoch": 4.992105263157895,
"frac_reward_zero_std": 0.53125,
"grad_norm": 330.7555847167969,
"learning_rate": 1e-06,
"loss": 0.0491,
"num_tokens": 621237364.0,
"reward": 0.8030959963798523,
"reward_std": 0.13730832934379578,
"rewards/progression_diversity/mean": -0.01755291409790516,
"rewards/progression_diversity/std": 0.07998932898044586,
"rewards/symbolic_reward_accuracy/mean": 0.8828125,
"rewards/symbolic_reward_accuracy/std": 0.32195815443992615,
"rewards/symbolic_reward_partial_score/mean": 0.9288737177848816,
"rewards/symbolic_reward_partial_score/std": 0.22742776572704315,
"rewards/tag_count_reward/mean": -0.05078125,
"rewards/tag_count_reward/std": 0.21976542472839355,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0468072891235352,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 412.0,
"sampling/sampling_logp_difference/mean": 3.8619487285614014,
"step": 1897
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.29312750697135925,
"epoch": 4.994736842105263,
"grad_norm": 0.026777606457471848,
"learning_rate": 1e-06,
"loss": 0.1218,
"step": 1898
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.30650024116039276,
"epoch": 4.997368421052632,
"grad_norm": 0.006984531879425049,
"learning_rate": 1e-06,
"loss": 0.0475,
"step": 1899
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.295014426112175,
"epoch": 5.0,
"grad_norm": 0.007008237764239311,
"learning_rate": 1e-06,
"loss": 0.1042,
"step": 1900
},
{
"epoch": 5.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.015380859375,
"eval_completions/max_length": 13712.375,
"eval_completions/max_terminated_length": 2437.9375,
"eval_completions/mean_length": 905.335693359375,
"eval_completions/mean_terminated_length": 664.4008159637451,
"eval_completions/min_length": 251.21875,
"eval_completions/min_terminated_length": 251.21875,
"eval_entropy": 0.31546804029494524,
"eval_frac_reward_zero_std": 0.59765625,
"eval_loss": 0.018208162859082222,
"eval_num_tokens": 621237364.0,
"eval_reward": 0.8710558321326971,
"eval_reward_std": 0.08327648929116549,
"eval_rewards/progression_diversity/mean": -0.00623641712081735,
"eval_rewards/progression_diversity/std": 0.04451105743646622,
"eval_rewards/symbolic_reward_accuracy/mean": 0.964111328125,
"eval_rewards/symbolic_reward_accuracy/std": 0.1680670971982181,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9798990897834301,
"eval_rewards/symbolic_reward_partial_score/std": 0.10754719079704955,
"eval_rewards/tag_count_reward/mean": -0.01318359375,
"eval_rewards/tag_count_reward/std": 0.09598715417087078,
"eval_runtime": 3296.8572,
"eval_samples_per_second": 0.076,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.0692463777959347,
"eval_sampling/importance_sampling_ratio/min": 7.846449850264574e-05,
"eval_sampling/sampling_logp_difference/max": 337.0917568653822,
"eval_sampling/sampling_logp_difference/mean": 0.9625979592092335,
"eval_steps_per_second": 0.001,
"step": 1900
},
{
"epoch": 5.0,
"step": 1900,
"total_flos": 0.0,
"train_loss": 0.10312542560923918,
"train_runtime": 114002.2359,
"train_samples_per_second": 0.134,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 1900,
"num_input_tokens_seen": 621237364,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}