Files
RLCR-math-3B/trainer_state.json
ModelHub XC c776969f82 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/RLCR-math-3B
Source: Original Platform
2026-05-04 23:15:58 +08:00

1397 lines
53 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4992,
"eval_steps": 50,
"global_step": 234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04873046875,
"completions/max_length": 3832.0,
"completions/max_terminated_length": 3832.0,
"completions/mean_length": 417.21298828125,
"completions/mean_terminated_length": 438.694482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.0022994126193225384,
"learning_rate": 5.319148936170213e-07,
"loss": -0.0106,
"num_tokens": 7050533.0,
"reward": 0.2718377888202667,
"reward_std": 0.3972369432449341,
"rewards/accuracy_reward": 0.0904296875,
"rewards/brier_reward": 0.11152872890233993,
"rewards/confidence_one_or_zero": 0.380078125,
"rewards/format_reward": 0.34169921875,
"rewards/mean_confidence_reward": 0.5165613055229187,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.047265625,
"completions/max_length": 3725.8,
"completions/max_terminated_length": 3725.8,
"completions/mean_length": 396.103125,
"completions/mean_terminated_length": 415.90752563476565,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.002011506352573633,
"learning_rate": 1.0638297872340427e-06,
"loss": -0.0032,
"num_tokens": 13869093.0,
"reward": 0.36579355001449587,
"reward_std": 0.4264163374900818,
"rewards/accuracy_reward": 0.119921875,
"rewards/brier_reward": 0.14758132696151732,
"rewards/confidence_one_or_zero": 0.4490234375,
"rewards/format_reward": 0.4640625,
"rewards/mean_confidence_reward": 0.6210533976554871,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0349609375,
"completions/max_length": 3780.8,
"completions/max_terminated_length": 3780.8,
"completions/mean_length": 342.1515625,
"completions/mean_terminated_length": 354.5928588867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.8,
"epoch": 0.032,
"grad_norm": 0.0011837292695418,
"learning_rate": 1.595744680851064e-06,
"loss": -0.0015,
"num_tokens": 20090005.0,
"reward": 0.5768917322158813,
"reward_std": 0.42254899740219115,
"rewards/accuracy_reward": 0.17490234375,
"rewards/brier_reward": 0.2182064473628998,
"rewards/confidence_one_or_zero": 0.6333984375,
"rewards/format_reward": 0.76064453125,
"rewards/mean_confidence_reward": 0.8744643926620483,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03291015625,
"completions/max_length": 3326.2,
"completions/max_terminated_length": 3326.2,
"completions/mean_length": 300.54345703125,
"completions/mean_terminated_length": 310.7569152832031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 36.6,
"epoch": 0.042666666666666665,
"grad_norm": 0.005445198621600866,
"learning_rate": 2.1276595744680853e-06,
"loss": -0.0134,
"num_tokens": 25946610.0,
"reward": 0.6536507606506348,
"reward_std": 0.3477814972400665,
"rewards/accuracy_reward": 0.167578125,
"rewards/brier_reward": 0.2340293437242508,
"rewards/confidence_one_or_zero": 0.5921875,
"rewards/format_reward": 0.9056640625,
"rewards/mean_confidence_reward": 0.906139588356018,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0265625,
"completions/max_length": 3896.2,
"completions/max_terminated_length": 3896.2,
"completions/mean_length": 271.092578125,
"completions/mean_terminated_length": 278.50535888671874,
"completions/min_length": 0.0,
"completions/min_terminated_length": 45.2,
"epoch": 0.05333333333333334,
"grad_norm": 0.0007234986987896264,
"learning_rate": 2.6595744680851065e-06,
"loss": -0.019,
"num_tokens": 31458182.0,
"reward": 0.7525559663772583,
"reward_std": 0.3314167380332947,
"rewards/accuracy_reward": 0.2111328125,
"rewards/brier_reward": 0.3317470133304596,
"rewards/confidence_one_or_zero": 0.376171875,
"rewards/format_reward": 0.96220703125,
"rewards/mean_confidence_reward": 0.8775923013687134,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.019921875,
"completions/max_length": 3383.8,
"completions/max_terminated_length": 3383.8,
"completions/mean_length": 309.6873046875,
"completions/mean_terminated_length": 315.9191589355469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 48.2,
"epoch": 0.064,
"grad_norm": 0.0004548918514046818,
"learning_rate": 3.191489361702128e-06,
"loss": -0.0139,
"num_tokens": 37443556.0,
"reward": 0.8603755593299866,
"reward_std": 0.3160943269729614,
"rewards/accuracy_reward": 0.25341796875,
"rewards/brier_reward": 0.4920231521129608,
"rewards/confidence_one_or_zero": 0.08056640625,
"rewards/format_reward": 0.97529296875,
"rewards/mean_confidence_reward": 0.7704261898994446,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0197265625,
"completions/max_length": 3442.8,
"completions/max_terminated_length": 3442.8,
"completions/mean_length": 341.35791015625,
"completions/mean_terminated_length": 348.3955383300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.8,
"epoch": 0.07466666666666667,
"grad_norm": 0.00027860456611961126,
"learning_rate": 3.723404255319149e-06,
"loss": -0.0176,
"num_tokens": 43686581.0,
"reward": 0.9539328455924988,
"reward_std": 0.25055329501628876,
"rewards/accuracy_reward": 0.2630859375,
"rewards/brier_reward": 0.6670343160629273,
"rewards/confidence_one_or_zero": 0.0181640625,
"rewards/format_reward": 0.977734375,
"rewards/mean_confidence_reward": 0.5359978020191193,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02119140625,
"completions/max_length": 3646.2,
"completions/max_terminated_length": 3646.2,
"completions/mean_length": 370.0533203125,
"completions/mean_terminated_length": 378.1131530761719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.8,
"epoch": 0.08533333333333333,
"grad_norm": 0.00019421246543060988,
"learning_rate": 4.255319148936171e-06,
"loss": -0.0236,
"num_tokens": 50196599.0,
"reward": 1.005708146095276,
"reward_std": 0.19348926544189454,
"rewards/accuracy_reward": 0.30771484375,
"rewards/brier_reward": 0.7264483809471131,
"rewards/confidence_one_or_zero": 0.02001953125,
"rewards/format_reward": 0.97724609375,
"rewards/mean_confidence_reward": 0.32700340151786805,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0263671875,
"completions/max_length": 3611.4,
"completions/max_terminated_length": 3611.4,
"completions/mean_length": 419.93203125,
"completions/mean_terminated_length": 431.3776428222656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.8,
"epoch": 0.096,
"grad_norm": 0.00017933457274921238,
"learning_rate": 4.787234042553192e-06,
"loss": -0.0264,
"num_tokens": 57285631.0,
"reward": 1.0260677814483643,
"reward_std": 0.17603319883346558,
"rewards/accuracy_reward": 0.37861328125,
"rewards/brier_reward": 0.701446795463562,
"rewards/confidence_one_or_zero": 0.02587890625,
"rewards/format_reward": 0.9720703125,
"rewards/mean_confidence_reward": 0.23751758635044098,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.031640625,
"completions/max_length": 3522.6,
"completions/max_terminated_length": 3522.6,
"completions/mean_length": 460.63408203125,
"completions/mean_terminated_length": 475.79522094726565,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.2,
"epoch": 0.10666666666666667,
"grad_norm": 0.00020549257169477642,
"learning_rate": 4.919786096256685e-06,
"loss": -0.0343,
"num_tokens": 64789148.0,
"reward": 1.0377721786499023,
"reward_std": 0.19848274290561677,
"rewards/accuracy_reward": 0.403125,
"rewards/brier_reward": 0.7054211854934692,
"rewards/confidence_one_or_zero": 0.016796875,
"rewards/format_reward": 0.9669921875,
"rewards/mean_confidence_reward": 0.28423587083816526,
"step": 50
},
{
"epoch": 0.10666666666666667,
"eval_completions/clipped_ratio": 0.033203125,
"eval_completions/max_length": 1511.6875,
"eval_completions/max_terminated_length": 1511.6875,
"eval_completions/mean_length": 481.26543045043945,
"eval_completions/mean_terminated_length": 497.91385078430176,
"eval_completions/min_length": 30.5,
"eval_completions/min_terminated_length": 152.3125,
"eval_loss": 0.0,
"eval_num_tokens": 64789148.0,
"eval_reward": 1.0428522787988186,
"eval_reward_std": 0.2570919021964073,
"eval_rewards/accuracy_reward": 0.41015625,
"eval_rewards/brier_reward": 0.7097212113440037,
"eval_rewards/confidence_one_or_zero": 0.021484375,
"eval_rewards/format_reward": 0.9658203125,
"eval_rewards/mean_confidence_reward": 0.31723164208233356,
"eval_runtime": 330.2735,
"eval_samples_per_second": 3.028,
"eval_steps_per_second": 0.048,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.037109375,
"completions/max_length": 3745.6,
"completions/max_terminated_length": 3745.6,
"completions/mean_length": 499.6552734375,
"completions/mean_terminated_length": 518.9554016113282,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.6,
"epoch": 0.11733333333333333,
"grad_norm": 0.00023679890728089958,
"learning_rate": 4.786096256684493e-06,
"loss": -0.0424,
"num_tokens": 72641586.0,
"reward": 1.0429035425186157,
"reward_std": 0.2101288139820099,
"rewards/accuracy_reward": 0.41943359375,
"rewards/brier_reward": 0.7049405694007873,
"rewards/confidence_one_or_zero": 0.01376953125,
"rewards/format_reward": 0.96142578125,
"rewards/mean_confidence_reward": 0.34695433378219603,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03681640625,
"completions/max_length": 3240.2,
"completions/max_terminated_length": 3240.2,
"completions/mean_length": 516.13466796875,
"completions/mean_terminated_length": 535.995068359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.2,
"epoch": 0.128,
"grad_norm": 0.00020350873819552362,
"learning_rate": 4.6524064171123e-06,
"loss": -0.0377,
"num_tokens": 80683029.0,
"reward": 1.0565701007843018,
"reward_std": 0.21977659463882446,
"rewards/accuracy_reward": 0.4330078125,
"rewards/brier_reward": 0.718209958076477,
"rewards/confidence_one_or_zero": 0.01044921875,
"rewards/format_reward": 0.9619140625,
"rewards/mean_confidence_reward": 0.40334631204605104,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03916015625,
"completions/max_length": 3614.2,
"completions/max_terminated_length": 3614.2,
"completions/mean_length": 548.8623046875,
"completions/mean_terminated_length": 571.418896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 146.8,
"epoch": 0.13866666666666666,
"grad_norm": 0.00023808285186532885,
"learning_rate": 4.518716577540107e-06,
"loss": -0.0388,
"num_tokens": 89039955.0,
"reward": 1.0643263101577758,
"reward_std": 0.22522514760494233,
"rewards/accuracy_reward": 0.45087890625,
"rewards/brier_reward": 0.718096923828125,
"rewards/confidence_one_or_zero": 0.00546875,
"rewards/format_reward": 0.95966796875,
"rewards/mean_confidence_reward": 0.4328162491321564,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04814453125,
"completions/max_length": 3548.4,
"completions/max_terminated_length": 3548.4,
"completions/mean_length": 565.52802734375,
"completions/mean_terminated_length": 594.6517456054687,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.8,
"epoch": 0.14933333333333335,
"grad_norm": 0.003085868200287223,
"learning_rate": 4.385026737967915e-06,
"loss": -0.046,
"num_tokens": 97599986.0,
"reward": 1.06097412109375,
"reward_std": 0.23501123189926149,
"rewards/accuracy_reward": 0.46328125,
"rewards/brier_reward": 0.7087554454803466,
"rewards/confidence_one_or_zero": 0.002734375,
"rewards/format_reward": 0.94990234375,
"rewards/mean_confidence_reward": 0.4535819053649902,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03466796875,
"completions/max_length": 3784.8,
"completions/max_terminated_length": 3784.8,
"completions/mean_length": 603.82158203125,
"completions/mean_terminated_length": 625.44541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.16,
"grad_norm": 0.0002215144777437672,
"learning_rate": 4.251336898395722e-06,
"loss": -0.0343,
"num_tokens": 106515951.0,
"reward": 1.0931342601776124,
"reward_std": 0.21771803498268127,
"rewards/accuracy_reward": 0.50087890625,
"rewards/brier_reward": 0.7211225032806396,
"rewards/confidence_one_or_zero": 0.00166015625,
"rewards/format_reward": 0.9642578125,
"rewards/mean_confidence_reward": 0.46166038513183594,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03857421875,
"completions/max_length": 3760.2,
"completions/max_terminated_length": 3760.2,
"completions/mean_length": 598.0275390625,
"completions/mean_terminated_length": 622.232177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.4,
"epoch": 0.17066666666666666,
"grad_norm": 0.0002537222462706268,
"learning_rate": 4.11764705882353e-06,
"loss": -0.0398,
"num_tokens": 115384169.0,
"reward": 1.0766690015792846,
"reward_std": 0.23046172559261321,
"rewards/accuracy_reward": 0.47578125,
"rewards/brier_reward": 0.7174887418746948,
"rewards/confidence_one_or_zero": 0.0009765625,
"rewards/format_reward": 0.96005859375,
"rewards/mean_confidence_reward": 0.46985026001930236,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03154296875,
"completions/max_length": 3239.2,
"completions/max_terminated_length": 3239.2,
"completions/mean_length": 640.25078125,
"completions/mean_terminated_length": 661.4207153320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.6,
"epoch": 0.18133333333333335,
"grad_norm": 0.0002039925311692059,
"learning_rate": 3.983957219251337e-06,
"loss": -0.0322,
"num_tokens": 124663409.0,
"reward": 1.111165952682495,
"reward_std": 0.21703881323337554,
"rewards/accuracy_reward": 0.53037109375,
"rewards/brier_reward": 0.7248614072799683,
"rewards/confidence_one_or_zero": 0.00087890625,
"rewards/format_reward": 0.96708984375,
"rewards/mean_confidence_reward": 0.47759194374084474,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0322265625,
"completions/max_length": 3809.0,
"completions/max_terminated_length": 3809.0,
"completions/mean_length": 674.80703125,
"completions/mean_terminated_length": 697.274072265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.8,
"epoch": 0.192,
"grad_norm": 0.0002259796456201002,
"learning_rate": 3.850267379679145e-06,
"loss": -0.0323,
"num_tokens": 134289081.0,
"reward": 1.1023191452026366,
"reward_std": 0.21436747312545776,
"rewards/accuracy_reward": 0.5142578125,
"rewards/brier_reward": 0.7240622282028198,
"rewards/confidence_one_or_zero": 0.0001953125,
"rewards/format_reward": 0.96630859375,
"rewards/mean_confidence_reward": 0.4761265993118286,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.034375,
"completions/max_length": 3722.0,
"completions/max_terminated_length": 3722.0,
"completions/mean_length": 695.24814453125,
"completions/mean_terminated_length": 720.0158325195313,
"completions/min_length": 0.0,
"completions/min_terminated_length": 201.2,
"epoch": 0.20266666666666666,
"grad_norm": 0.0002535897074267268,
"learning_rate": 3.716577540106952e-06,
"loss": -0.0348,
"num_tokens": 144155206.0,
"reward": 1.1052819728851317,
"reward_std": 0.22034441232681273,
"rewards/accuracy_reward": 0.52578125,
"rewards/brier_reward": 0.721687114238739,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.9630859375,
"rewards/mean_confidence_reward": 0.4807871162891388,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03681640625,
"completions/max_length": 3776.0,
"completions/max_terminated_length": 3776.0,
"completions/mean_length": 714.55859375,
"completions/mean_terminated_length": 742.0417602539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.00020959587709512562,
"learning_rate": 3.5828877005347597e-06,
"loss": -0.0367,
"num_tokens": 154205790.0,
"reward": 1.1090866804122925,
"reward_std": 0.21838183403015138,
"rewards/accuracy_reward": 0.53486328125,
"rewards/brier_reward": 0.7215817093849182,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.96171875,
"rewards/mean_confidence_reward": 0.48115434050559996,
"step": 100
},
{
"epoch": 0.21333333333333335,
"eval_completions/clipped_ratio": 0.0322265625,
"eval_completions/max_length": 2135.5,
"eval_completions/max_terminated_length": 2135.5,
"eval_completions/mean_length": 729.1765632629395,
"eval_completions/mean_terminated_length": 753.6238555908203,
"eval_completions/min_length": 32.8125,
"eval_completions/min_terminated_length": 265.25,
"eval_loss": 0.0,
"eval_num_tokens": 154205790.0,
"eval_reward": 1.106356181204319,
"eval_reward_std": 0.3143840888515115,
"eval_rewards/accuracy_reward": 0.5185546875,
"eval_rewards/brier_reward": 0.7273510619997978,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.966796875,
"eval_rewards/mean_confidence_reward": 0.4822753965854645,
"eval_runtime": 345.9345,
"eval_samples_per_second": 2.891,
"eval_steps_per_second": 0.046,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02333984375,
"completions/max_length": 3680.4,
"completions/max_terminated_length": 3680.4,
"completions/mean_length": 740.38525390625,
"completions/mean_terminated_length": 758.0415649414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.2,
"epoch": 0.224,
"grad_norm": 0.000193859072169289,
"learning_rate": 3.449197860962567e-06,
"loss": -0.0224,
"num_tokens": 164522343.0,
"reward": 1.1161011457443237,
"reward_std": 0.1981082409620285,
"rewards/accuracy_reward": 0.52265625,
"rewards/brier_reward": 0.7332667350769043,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.97626953125,
"rewards/mean_confidence_reward": 0.4871376514434814,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0263671875,
"completions/max_length": 3624.6,
"completions/max_terminated_length": 3624.6,
"completions/mean_length": 750.2166015625,
"completions/mean_terminated_length": 770.4773681640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.23466666666666666,
"grad_norm": 0.00020454383047763258,
"learning_rate": 3.3155080213903747e-06,
"loss": -0.022,
"num_tokens": 174955409.0,
"reward": 1.1203881740570067,
"reward_std": 0.19748010039329528,
"rewards/accuracy_reward": 0.54072265625,
"rewards/brier_reward": 0.7278758764266968,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.97216796875,
"rewards/mean_confidence_reward": 0.48651269674301145,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03046875,
"completions/max_length": 3376.2,
"completions/max_terminated_length": 3376.2,
"completions/mean_length": 749.41201171875,
"completions/mean_terminated_length": 772.8901977539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.6,
"epoch": 0.24533333333333332,
"grad_norm": 0.0002078731486108154,
"learning_rate": 3.181818181818182e-06,
"loss": -0.0276,
"num_tokens": 185371564.0,
"reward": 1.11997492313385,
"reward_std": 0.20475648939609528,
"rewards/accuracy_reward": 0.544140625,
"rewards/brier_reward": 0.7269518375396729,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.96884765625,
"rewards/mean_confidence_reward": 0.4832145094871521,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.033203125,
"completions/max_length": 3540.6,
"completions/max_terminated_length": 3540.6,
"completions/mean_length": 756.285546875,
"completions/mean_terminated_length": 782.4071899414063,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.2,
"epoch": 0.256,
"grad_norm": 0.00019684557628352195,
"learning_rate": 3.0481283422459896e-06,
"loss": -0.0299,
"num_tokens": 195867160.0,
"reward": 1.1076245546340941,
"reward_std": 0.20969551503658296,
"rewards/accuracy_reward": 0.5251953125,
"rewards/brier_reward": 0.7241261005401611,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.96591796875,
"rewards/mean_confidence_reward": 0.48320378065109254,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0248046875,
"completions/max_length": 3629.2,
"completions/max_terminated_length": 3629.2,
"completions/mean_length": 731.28798828125,
"completions/mean_terminated_length": 749.828466796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.6,
"epoch": 0.26666666666666666,
"grad_norm": 0.00023284385679289699,
"learning_rate": 2.914438502673797e-06,
"loss": -0.024,
"num_tokens": 206106365.0,
"reward": 1.1237028598785401,
"reward_std": 0.20264139473438264,
"rewards/accuracy_reward": 0.542578125,
"rewards/brier_reward": 0.7308919787406921,
"rewards/confidence_one_or_zero": 0.00087890625,
"rewards/format_reward": 0.97392578125,
"rewards/mean_confidence_reward": 0.4858167290687561,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02783203125,
"completions/max_length": 3273.8,
"completions/max_terminated_length": 3273.8,
"completions/mean_length": 706.0017578125,
"completions/mean_terminated_length": 726.1967407226563,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.6,
"epoch": 0.2773333333333333,
"grad_norm": 0.0002099503471981734,
"learning_rate": 2.7807486631016045e-06,
"loss": -0.0268,
"num_tokens": 216081839.0,
"reward": 1.1141971349716187,
"reward_std": 0.1988514244556427,
"rewards/accuracy_reward": 0.52919921875,
"rewards/brier_reward": 0.7281892657279968,
"rewards/confidence_one_or_zero": 0.0001953125,
"rewards/format_reward": 0.97099609375,
"rewards/mean_confidence_reward": 0.4858289957046509,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02109375,
"completions/max_length": 3532.0,
"completions/max_terminated_length": 3532.0,
"completions/mean_length": 696.258984375,
"completions/mean_terminated_length": 711.3540283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.4,
"epoch": 0.288,
"grad_norm": 0.00021950564405415207,
"learning_rate": 2.647058823529412e-06,
"loss": -0.0214,
"num_tokens": 225948139.0,
"reward": 1.136391854286194,
"reward_std": 0.18573434352874757,
"rewards/accuracy_reward": 0.56083984375,
"rewards/brier_reward": 0.7337113857269287,
"rewards/confidence_one_or_zero": 0.0001953125,
"rewards/format_reward": 0.97822265625,
"rewards/mean_confidence_reward": 0.4880725979804993,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01826171875,
"completions/max_length": 3008.6,
"completions/max_terminated_length": 3008.6,
"completions/mean_length": 709.566796875,
"completions/mean_terminated_length": 722.8854858398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.8,
"epoch": 0.2986666666666667,
"grad_norm": 0.0001989748125197366,
"learning_rate": 2.5133689839572194e-06,
"loss": -0.0177,
"num_tokens": 235999223.0,
"reward": 1.139591360092163,
"reward_std": 0.17391741871833802,
"rewards/accuracy_reward": 0.56337890625,
"rewards/brier_reward": 0.7346415877342224,
"rewards/confidence_one_or_zero": 0.0001953125,
"rewards/format_reward": 0.98115234375,
"rewards/mean_confidence_reward": 0.49123767018318176,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02548828125,
"completions/max_length": 3968.8,
"completions/max_terminated_length": 3968.8,
"completions/mean_length": 752.06943359375,
"completions/mean_terminated_length": 771.8078369140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.30933333333333335,
"grad_norm": 0.000300207786494866,
"learning_rate": 2.379679144385027e-06,
"loss": -0.0249,
"num_tokens": 246474910.0,
"reward": 1.1206291913986206,
"reward_std": 0.19275815784931183,
"rewards/accuracy_reward": 0.53828125,
"rewards/brier_reward": 0.7297251224517822,
"rewards/confidence_one_or_zero": 0.00126953125,
"rewards/format_reward": 0.9732421875,
"rewards/mean_confidence_reward": 0.48650553822517395,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02158203125,
"completions/max_length": 3529.4,
"completions/max_terminated_length": 3529.4,
"completions/mean_length": 745.94892578125,
"completions/mean_terminated_length": 762.421044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.4,
"epoch": 0.32,
"grad_norm": 0.00023590361524838954,
"learning_rate": 2.2459893048128343e-06,
"loss": -0.0204,
"num_tokens": 256861427.0,
"reward": 1.1211992502212524,
"reward_std": 0.19113921225070954,
"rewards/accuracy_reward": 0.5330078125,
"rewards/brier_reward": 0.7318416714668274,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.9775390625,
"rewards/mean_confidence_reward": 0.4905679523944855,
"step": 150
},
{
"epoch": 0.32,
"eval_completions/clipped_ratio": 0.015625,
"eval_completions/max_length": 1815.6875,
"eval_completions/max_terminated_length": 1815.6875,
"eval_completions/mean_length": 742.1876945495605,
"eval_completions/mean_terminated_length": 754.273509979248,
"eval_completions/min_length": 137.6875,
"eval_completions/min_terminated_length": 270.0,
"eval_loss": 0.0,
"eval_num_tokens": 256861427.0,
"eval_reward": 1.1376136094331741,
"eval_reward_std": 0.2811641450971365,
"eval_rewards/accuracy_reward": 0.552734375,
"eval_rewards/brier_reward": 0.7390844747424126,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9833984375,
"eval_rewards/mean_confidence_reward": 0.49321289360523224,
"eval_runtime": 284.0103,
"eval_samples_per_second": 3.521,
"eval_steps_per_second": 0.056,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01787109375,
"completions/max_length": 3623.0,
"completions/max_terminated_length": 3623.0,
"completions/mean_length": 724.40556640625,
"completions/mean_terminated_length": 737.5683471679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.8,
"epoch": 0.33066666666666666,
"grad_norm": 0.0002188891521655023,
"learning_rate": 2.112299465240642e-06,
"loss": -0.0164,
"num_tokens": 267030892.0,
"reward": 1.133889079093933,
"reward_std": 0.17375424206256868,
"rewards/accuracy_reward": 0.55048828125,
"rewards/brier_reward": 0.7355417370796203,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98173828125,
"rewards/mean_confidence_reward": 0.4923380553722382,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 3686.2,
"completions/max_terminated_length": 3686.2,
"completions/mean_length": 724.2431640625,
"completions/mean_terminated_length": 738.1012939453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 207.6,
"epoch": 0.3413333333333333,
"grad_norm": 0.00019207321747671813,
"learning_rate": 1.9786096256684497e-06,
"loss": -0.0167,
"num_tokens": 277180582.0,
"reward": 1.1334651947021483,
"reward_std": 0.1689860612154007,
"rewards/accuracy_reward": 0.55,
"rewards/brier_reward": 0.7361587643623352,
"rewards/confidence_one_or_zero": 0.00078125,
"rewards/format_reward": 0.98076171875,
"rewards/mean_confidence_reward": 0.4910940706729889,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01513671875,
"completions/max_length": 3753.4,
"completions/max_terminated_length": 3753.4,
"completions/mean_length": 762.5583984375,
"completions/mean_terminated_length": 774.3962524414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.352,
"grad_norm": 0.00017471906903665513,
"learning_rate": 1.8449197860962567e-06,
"loss": -0.014,
"num_tokens": 287740188.0,
"reward": 1.1528494119644166,
"reward_std": 0.17534538209438325,
"rewards/accuracy_reward": 0.583984375,
"rewards/brier_reward": 0.7377201795578003,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.983984375,
"rewards/mean_confidence_reward": 0.4953427791595459,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02158203125,
"completions/max_length": 3530.6,
"completions/max_terminated_length": 3530.6,
"completions/mean_length": 787.61787109375,
"completions/mean_terminated_length": 804.80068359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.8,
"epoch": 0.3626666666666667,
"grad_norm": 0.0002157771377824247,
"learning_rate": 1.7112299465240642e-06,
"loss": -0.0207,
"num_tokens": 298560467.0,
"reward": 1.1448474168777465,
"reward_std": 0.1869949847459793,
"rewards/accuracy_reward": 0.5779296875,
"rewards/brier_reward": 0.7339232444763184,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.97783203125,
"rewards/mean_confidence_reward": 0.4906333088874817,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01767578125,
"completions/max_length": 3371.4,
"completions/max_terminated_length": 3371.4,
"completions/mean_length": 764.63466796875,
"completions/mean_terminated_length": 778.4362426757813,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.6,
"epoch": 0.37333333333333335,
"grad_norm": 0.00021693432063329965,
"learning_rate": 1.5775401069518716e-06,
"loss": -0.0178,
"num_tokens": 309152310.0,
"reward": 1.1474551439285279,
"reward_std": 0.18755724132061005,
"rewards/accuracy_reward": 0.57763671875,
"rewards/brier_reward": 0.7359159588813782,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98134765625,
"rewards/mean_confidence_reward": 0.49203906655311586,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02353515625,
"completions/max_length": 3407.4,
"completions/max_terminated_length": 3407.4,
"completions/mean_length": 772.02041015625,
"completions/mean_terminated_length": 790.5445190429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 224.2,
"epoch": 0.384,
"grad_norm": 0.0001946146658156067,
"learning_rate": 1.4438502673796793e-06,
"loss": -0.0226,
"num_tokens": 319806791.0,
"reward": 1.1229839086532594,
"reward_std": 0.17491495311260224,
"rewards/accuracy_reward": 0.5380859375,
"rewards/brier_reward": 0.7317001938819885,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.976171875,
"rewards/mean_confidence_reward": 0.4897919952869415,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0193359375,
"completions/max_length": 3332.2,
"completions/max_terminated_length": 3332.2,
"completions/mean_length": 774.34521484375,
"completions/mean_terminated_length": 789.7105224609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 216.6,
"epoch": 0.39466666666666667,
"grad_norm": 0.00021305667178239673,
"learning_rate": 1.3101604278074868e-06,
"loss": -0.0173,
"num_tokens": 330535798.0,
"reward": 1.1333304166793823,
"reward_std": 0.17513595819473265,
"rewards/accuracy_reward": 0.55146484375,
"rewards/brier_reward": 0.7354985475540161,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.9796875,
"rewards/mean_confidence_reward": 0.4905791044235229,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02119140625,
"completions/max_length": 3443.2,
"completions/max_terminated_length": 3443.2,
"completions/mean_length": 769.52978515625,
"completions/mean_terminated_length": 786.377783203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.4053333333333333,
"grad_norm": 0.00022391976381186396,
"learning_rate": 1.1764705882352942e-06,
"loss": -0.0204,
"num_tokens": 341155271.0,
"reward": 1.139868140220642,
"reward_std": 0.19301997423171996,
"rewards/accuracy_reward": 0.56884765625,
"rewards/brier_reward": 0.7333398222923279,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9775390625,
"rewards/mean_confidence_reward": 0.4906933605670929,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0189453125,
"completions/max_length": 3776.8,
"completions/max_terminated_length": 3776.8,
"completions/mean_length": 771.45654296875,
"completions/mean_terminated_length": 786.5269165039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.4,
"epoch": 0.416,
"grad_norm": 0.000211334801861085,
"learning_rate": 1.0427807486631017e-06,
"loss": -0.0174,
"num_tokens": 351822186.0,
"reward": 1.1492414712905883,
"reward_std": 0.17223585546016693,
"rewards/accuracy_reward": 0.5830078125,
"rewards/brier_reward": 0.7351918578147888,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.9802734375,
"rewards/mean_confidence_reward": 0.48971192836761473,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02529296875,
"completions/max_length": 4013.0,
"completions/max_terminated_length": 4013.0,
"completions/mean_length": 776.52587890625,
"completions/mean_terminated_length": 796.702392578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 232.0,
"epoch": 0.4266666666666667,
"grad_norm": 0.00024326991115231067,
"learning_rate": 9.090909090909091e-07,
"loss": -0.024,
"num_tokens": 362520019.0,
"reward": 1.136240339279175,
"reward_std": 0.19438377916812896,
"rewards/accuracy_reward": 0.56904296875,
"rewards/brier_reward": 0.7294045448303222,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.9740234375,
"rewards/mean_confidence_reward": 0.4875830113887787,
"step": 200
},
{
"epoch": 0.4266666666666667,
"eval_completions/clipped_ratio": 0.0205078125,
"eval_completions/max_length": 1878.5,
"eval_completions/max_terminated_length": 1878.5,
"eval_completions/mean_length": 764.9681663513184,
"eval_completions/mean_terminated_length": 780.9868965148926,
"eval_completions/min_length": 96.25,
"eval_completions/min_terminated_length": 297.125,
"eval_loss": 0.0,
"eval_num_tokens": 362520019.0,
"eval_reward": 1.1360864490270615,
"eval_reward_std": 0.2948375102132559,
"eval_rewards/accuracy_reward": 0.560546875,
"eval_rewards/brier_reward": 0.7340771444141865,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9775390625,
"eval_rewards/mean_confidence_reward": 0.4896484352648258,
"eval_runtime": 323.2058,
"eval_samples_per_second": 3.094,
"eval_steps_per_second": 0.05,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02255859375,
"completions/max_length": 3584.8,
"completions/max_terminated_length": 3584.8,
"completions/mean_length": 765.71416015625,
"completions/mean_terminated_length": 783.431103515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 215.8,
"epoch": 0.43733333333333335,
"grad_norm": 0.00024016403767745942,
"learning_rate": 7.754010695187167e-07,
"loss": -0.0218,
"num_tokens": 373108388.0,
"reward": 1.1395012855529785,
"reward_std": 0.18607062399387359,
"rewards/accuracy_reward": 0.56943359375,
"rewards/brier_reward": 0.732899010181427,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.97666015625,
"rewards/mean_confidence_reward": 0.48821924328804017,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02265625,
"completions/max_length": 3176.4,
"completions/max_terminated_length": 3176.4,
"completions/mean_length": 786.155078125,
"completions/mean_terminated_length": 804.3975830078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 215.4,
"epoch": 0.448,
"grad_norm": 0.00021111531532369554,
"learning_rate": 6.417112299465242e-07,
"loss": -0.0192,
"num_tokens": 383926040.0,
"reward": 1.1250059604644775,
"reward_std": 0.19036045372486116,
"rewards/accuracy_reward": 0.54189453125,
"rewards/brier_reward": 0.7315450549125672,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9765625,
"rewards/mean_confidence_reward": 0.4895175814628601,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.026171875,
"completions/max_length": 3783.6,
"completions/max_terminated_length": 3783.6,
"completions/mean_length": 765.16962890625,
"completions/mean_terminated_length": 785.7530517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 227.4,
"epoch": 0.45866666666666667,
"grad_norm": 0.00020393490558490157,
"learning_rate": 5.080213903743316e-07,
"loss": -0.0257,
"num_tokens": 394495009.0,
"reward": 1.1425009727478028,
"reward_std": 0.19886362850666045,
"rewards/accuracy_reward": 0.58173828125,
"rewards/brier_reward": 0.7300117135047912,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9732421875,
"rewards/mean_confidence_reward": 0.48793848156929015,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0228515625,
"completions/max_length": 3656.6,
"completions/max_terminated_length": 3656.6,
"completions/mean_length": 780.3501953125,
"completions/mean_terminated_length": 798.559521484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 234.6,
"epoch": 0.4693333333333333,
"grad_norm": 0.00024207662499975413,
"learning_rate": 3.7433155080213904e-07,
"loss": -0.0225,
"num_tokens": 405219299.0,
"reward": 1.1340527296066285,
"reward_std": 0.1835294783115387,
"rewards/accuracy_reward": 0.559765625,
"rewards/brier_reward": 0.731767475605011,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9765625,
"rewards/mean_confidence_reward": 0.4893447160720825,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0205078125,
"completions/max_length": 3377.4,
"completions/max_terminated_length": 3377.4,
"completions/mean_length": 782.55126953125,
"completions/mean_terminated_length": 798.9751098632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.4,
"epoch": 0.48,
"grad_norm": 0.00020044122356921434,
"learning_rate": 2.4064171122994655e-07,
"loss": -0.021,
"num_tokens": 415955856.0,
"reward": 1.140486478805542,
"reward_std": 0.1799767643213272,
"rewards/accuracy_reward": 0.566796875,
"rewards/brier_reward": 0.7353576302528382,
"rewards/confidence_one_or_zero": 0.0001953125,
"rewards/format_reward": 0.97880859375,
"rewards/mean_confidence_reward": 0.490503066778183,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02333984375,
"completions/max_length": 3544.2,
"completions/max_terminated_length": 3544.2,
"completions/mean_length": 787.57099609375,
"completions/mean_terminated_length": 806.4834838867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.49066666666666664,
"grad_norm": 0.00020212125673424453,
"learning_rate": 1.0695187165775401e-07,
"loss": -0.0235,
"num_tokens": 426747335.0,
"reward": 1.1553166151046752,
"reward_std": 0.19148518443107604,
"rewards/accuracy_reward": 0.60361328125,
"rewards/brier_reward": 0.7308382272720337,
"rewards/confidence_one_or_zero": 9.765625e-05,
"rewards/format_reward": 0.976171875,
"rewards/mean_confidence_reward": 0.48845637440681455,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.018798828125,
"completions/max_length": 3411.5,
"completions/max_terminated_length": 3411.5,
"completions/mean_length": 780.903076171875,
"completions/mean_terminated_length": 796.0233154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.5,
"epoch": 0.4992,
"num_tokens": 435349741.0,
"reward": 1.1434746384620667,
"reward_std": 0.18124820291996002,
"rewards/accuracy_reward": 0.5712890625,
"rewards/brier_reward": 0.7353035807609558,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9803466796875,
"rewards/mean_confidence_reward": 0.49278198927640915,
"step": 234,
"total_flos": 0.0,
"train_loss": -0.01013354094237344,
"train_runtime": 38659.6234,
"train_samples_per_second": 0.388,
"train_steps_per_second": 0.006
}
],
"logging_steps": 5,
"max_steps": 234,
"num_input_tokens_seen": 435349741,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}