{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5008, "eval_steps": 15, "global_step": 313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02669270833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4017.0, "completions/mean_length": 578.3873901367188, "completions/mean_terminated_length": 481.917724609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0016, "grad_norm": 0.004069434478878975, "learning_rate": 7.936507936507937e-08, "loss": 0.0132, "num_tokens": 1142099.0, "reward": 0.3935546875, "reward_std": 0.36991819739341736, "rewards/accuracy_reward": 0.2591145932674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5279948115348816, "rewards/mean_confidence_reward": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 545.4765625, "completions/mean_terminated_length": 493.8837585449219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0032, "grad_norm": 0.003504666266962886, "learning_rate": 1.5873015873015874e-07, "loss": 0.0201, "num_tokens": 2228911.0, "reward": 0.3837890625, "reward_std": 0.35765132308006287, "rewards/accuracy_reward": 0.23828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.529296875, "rewards/mean_confidence_reward": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01692708333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3800.0, "completions/mean_length": 551.2428588867188, "completions/mean_terminated_length": 490.2073059082031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0048, "grad_norm": 0.0028648818843066692, "learning_rate": 2.3809523809523811e-07, "loss": 0.0167, "num_tokens": 3334372.0, "reward": 0.3883463740348816, "reward_std": 0.3612661361694336, "rewards/accuracy_reward": 0.228515625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5481770634651184, "rewards/mean_confidence_reward": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02473958333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3682.0, "completions/mean_length": 585.3483276367188, "completions/mean_terminated_length": 496.2930908203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0064, "grad_norm": 0.002415076596662402, "learning_rate": 3.174603174603175e-07, "loss": 0.0176, "num_tokens": 4488091.0, "reward": 0.3935546875, "reward_std": 0.3514183759689331, "rewards/accuracy_reward": 0.2389322966337204, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5481770634651184, "rewards/mean_confidence_reward": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 628.208984375, "completions/mean_terminated_length": 530.7208862304688, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.008, "grad_norm": 0.003127189353108406, "learning_rate": 3.9682539682539683e-07, "loss": 0.0201, "num_tokens": 5711164.0, "reward": 0.392578125, "reward_std": 0.36937087774276733, "rewards/accuracy_reward": 0.2447916716337204, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5403645634651184, "rewards/mean_confidence_reward": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02018229166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3941.0, "completions/mean_length": 580.3392333984375, "completions/mean_terminated_length": 507.9235534667969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0096, "grad_norm": 0.003027706639841199, "learning_rate": 4.7619047619047623e-07, "loss": 0.0159, "num_tokens": 6868453.0, "reward": 0.3629557490348816, "reward_std": 0.3447269797325134, "rewards/accuracy_reward": 0.2044270783662796, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.521484375, "rewards/mean_confidence_reward": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01888020833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3895.0, "completions/mean_length": 557.626953125, "completions/mean_terminated_length": 489.5361633300781, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0112, "grad_norm": 0.003589223138988018, "learning_rate": 5.555555555555555e-07, "loss": 0.0232, "num_tokens": 7975592.0, "reward": 0.4397786557674408, "reward_std": 0.37646299600601196, "rewards/accuracy_reward": 0.2864583432674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5930989384651184, "rewards/mean_confidence_reward": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 552.0319213867188, "completions/mean_terminated_length": 486.228759765625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 0.004750226624310017, "learning_rate": 6.34920634920635e-07, "loss": 0.023, "num_tokens": 9072857.0, "reward": 0.4417317807674408, "reward_std": 0.3572891354560852, "rewards/accuracy_reward": 0.2975260317325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5859375, "rewards/mean_confidence_reward": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3898.0, "completions/mean_length": 582.044921875, "completions/mean_terminated_length": 504.8922119140625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0144, "grad_norm": 0.0035954380873590708, "learning_rate": 7.142857142857143e-07, "loss": 0.0232, "num_tokens": 10214398.0, "reward": 0.4446614682674408, "reward_std": 0.34959179162979126, "rewards/accuracy_reward": 0.2701822817325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.619140625, "rewards/mean_confidence_reward": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 510.265625, "completions/mean_terminated_length": 462.96044921875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.016, "grad_norm": 0.19951961934566498, "learning_rate": 7.936507936507937e-07, "loss": 0.0201, "num_tokens": 11260278.0, "reward": 0.513671875, "reward_std": 0.33598825335502625, "rewards/accuracy_reward": 0.3001302182674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.7272135615348816, "rewards/mean_confidence_reward": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3645.0, "completions/mean_length": 585.01171875, "completions/mean_terminated_length": 519.8209228515625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0176, "grad_norm": 0.0024398418609052896, "learning_rate": 8.73015873015873e-07, "loss": 0.0223, "num_tokens": 12417384.0, "reward": 0.5211588740348816, "reward_std": 0.34508538246154785, "rewards/accuracy_reward": 0.3235677182674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.71875, "rewards/mean_confidence_reward": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 475.22723388671875, "completions/mean_terminated_length": 434.7050476074219, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0192, "grad_norm": 0.0016301465220749378, "learning_rate": 9.523809523809525e-07, "loss": 0.0137, "num_tokens": 13419333.0, "reward": 0.5641276240348816, "reward_std": 0.31557729840278625, "rewards/accuracy_reward": 0.3307291567325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.7975260615348816, "rewards/mean_confidence_reward": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3864.0, "completions/mean_length": 409.70184326171875, "completions/mean_terminated_length": 368.44635009765625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.0208, "grad_norm": 0.0015007412293925881, "learning_rate": 1.0317460317460317e-06, "loss": 0.0189, "num_tokens": 14301211.0, "reward": 0.6136068105697632, "reward_std": 0.2852426767349243, "rewards/accuracy_reward": 0.3404947817325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.88671875, "rewards/mean_confidence_reward": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3396.0, "completions/mean_length": 396.4544372558594, "completions/mean_terminated_length": 355.0506896972656, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0224, "grad_norm": 0.0009811328491196036, "learning_rate": 1.111111111111111e-06, "loss": 0.0116, "num_tokens": 15146101.0, "reward": 0.6634114980697632, "reward_std": 0.27086055278778076, "rewards/accuracy_reward": 0.4140625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9127604365348816, "rewards/mean_confidence_reward": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 375.345703125, "completions/mean_terminated_length": 346.0492248535156, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.024, "grad_norm": 0.001291697146371007, "learning_rate": 1.1904761904761906e-06, "loss": 0.0133, "num_tokens": 15954696.0, "reward": 0.6689453125, "reward_std": 0.2637009620666504, "rewards/accuracy_reward": 0.412109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.92578125, "rewards/mean_confidence_reward": 0.0, "step": 15 }, { "epoch": 0.024, "eval_completions/clipped_ratio": 0.012920673076923073, "eval_completions/max_length": 4096.0, "eval_completions/max_terminated_length": 2134.375, "eval_completions/mean_length": 441.61809158325195, "eval_completions/mean_terminated_length": 393.7573356628418, "eval_completions/min_length": 52.875, "eval_completions/min_terminated_length": 52.875, "eval_loss": 0.0, "eval_num_tokens": 15954696.0, "eval_reward": 0.65478515625, "eval_reward_std": 0.2928590625524521, "eval_rewards/accuracy_reward": 0.375, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9345703125, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 258.4809, "eval_samples_per_second": 3.869, "eval_steps_per_second": 0.031, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 438.7005310058594, "completions/mean_terminated_length": 380.64813232421875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0256, "grad_norm": 0.0011755065061151981, "learning_rate": 1.26984126984127e-06, "loss": 0.014, "num_tokens": 16875676.0, "reward": 0.6429036855697632, "reward_std": 0.26328733563423157, "rewards/accuracy_reward": 0.3639322817325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.921875, "rewards/mean_confidence_reward": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 394.6692810058594, "completions/mean_terminated_length": 363.07550048828125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0272, "grad_norm": 0.0008988995687104762, "learning_rate": 1.3492063492063493e-06, "loss": 0.0127, "num_tokens": 17721440.0, "reward": 0.712890625, "reward_std": 0.2529163062572479, "rewards/accuracy_reward": 0.4739583432674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9518229365348816, "rewards/mean_confidence_reward": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3906.0, "completions/mean_length": 414.04559326171875, "completions/mean_terminated_length": 385.0538024902344, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0288, "grad_norm": 0.0007314863614737988, "learning_rate": 1.4285714285714286e-06, "loss": 0.0069, "num_tokens": 18610726.0, "reward": 0.6565755605697632, "reward_std": 0.22693131864070892, "rewards/accuracy_reward": 0.3528645932674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9602864384651184, "rewards/mean_confidence_reward": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3801.0, "completions/mean_length": 447.16796875, "completions/mean_terminated_length": 403.90118408203125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0304, "grad_norm": 0.000571794924326241, "learning_rate": 1.507936507936508e-06, "loss": 0.011, "num_tokens": 19552968.0, "reward": 0.7083333730697632, "reward_std": 0.21525681018829346, "rewards/accuracy_reward": 0.4407552182674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9759114384651184, "rewards/mean_confidence_reward": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3822.0, "completions/mean_length": 482.12371826171875, "completions/mean_terminated_length": 434.4472351074219, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.032, "grad_norm": 0.00046746429870836437, "learning_rate": 1.5873015873015873e-06, "loss": 0.0132, "num_tokens": 20560486.0, "reward": 0.6634114980697632, "reward_std": 0.21357280015945435, "rewards/accuracy_reward": 0.3548177182674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9720051884651184, "rewards/mean_confidence_reward": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3902.0, "completions/mean_length": 399.27734375, "completions/mean_terminated_length": 372.6124572753906, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0336, "grad_norm": 0.0006239061476662755, "learning_rate": 1.6666666666666667e-06, "loss": 0.0045, "num_tokens": 21426448.0, "reward": 0.6591796875, "reward_std": 0.181607186794281, "rewards/accuracy_reward": 0.337890625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98046875, "rewards/mean_confidence_reward": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3217.0, "completions/mean_length": 437.21484375, "completions/mean_terminated_length": 403.5597839355469, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.0352, "grad_norm": 0.0005117854452691972, "learning_rate": 1.746031746031746e-06, "loss": 0.0042, "num_tokens": 22361434.0, "reward": 0.6695963740348816, "reward_std": 0.2032264918088913, "rewards/accuracy_reward": 0.3561197817325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9830729365348816, "rewards/mean_confidence_reward": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3687.0, "completions/mean_length": 433.15496826171875, "completions/mean_terminated_length": 406.73443603515625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0368, "grad_norm": 0.0005601196317002177, "learning_rate": 1.8253968253968254e-06, "loss": 0.0034, "num_tokens": 23282184.0, "reward": 0.7135416865348816, "reward_std": 0.21026454865932465, "rewards/accuracy_reward": 0.443359375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 402.15496826171875, "completions/mean_terminated_length": 380.3837585449219, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0384, "grad_norm": 0.0005225238855928183, "learning_rate": 1.904761904761905e-06, "loss": 0.0009, "num_tokens": 24150934.0, "reward": 0.7151693105697632, "reward_std": 0.1983954906463623, "rewards/accuracy_reward": 0.4407552182674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3834.0, "completions/mean_length": 475.97137451171875, "completions/mean_terminated_length": 435.45751953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.04, "grad_norm": 0.00042087462497875094, "learning_rate": 1.984126984126984e-06, "loss": 0.0073, "num_tokens": 25141578.0, "reward": 0.697265625, "reward_std": 0.20233683288097382, "rewards/accuracy_reward": 0.41015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 473.21484375, "completions/mean_terminated_length": 435.0802917480469, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0416, "grad_norm": 0.00041561329271644354, "learning_rate": 2.0634920634920634e-06, "loss": 0.0121, "num_tokens": 26127316.0, "reward": 0.69921875, "reward_std": 0.17708146572113037, "rewards/accuracy_reward": 0.4127604067325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3462.0, "completions/mean_length": 473.30078125, "completions/mean_terminated_length": 459.0941162109375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.0432, "grad_norm": 0.0004529333091340959, "learning_rate": 2.1428571428571427e-06, "loss": 0.002, "num_tokens": 27112834.0, "reward": 0.732421875, "reward_std": 0.19999021291732788, "rewards/accuracy_reward": 0.4713541567325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 399.91015625, "completions/mean_terminated_length": 373.2498474121094, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.0448, "grad_norm": 0.0005229237140156329, "learning_rate": 2.222222222222222e-06, "loss": 0.0052, "num_tokens": 27980216.0, "reward": 0.7809244990348816, "reward_std": 0.18244394659996033, "rewards/accuracy_reward": 0.5709635615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3285.0, "completions/mean_length": 505.5032653808594, "completions/mean_terminated_length": 474.8555603027344, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.0464, "grad_norm": 0.0004277468251530081, "learning_rate": 2.301587301587302e-06, "loss": 0.0085, "num_tokens": 29010269.0, "reward": 0.7594401240348816, "reward_std": 0.21015426516532898, "rewards/accuracy_reward": 0.5319010615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3960.0, "completions/mean_length": 463.9446716308594, "completions/mean_terminated_length": 435.3457946777344, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.048, "grad_norm": 0.00036658241879194975, "learning_rate": 2.380952380952381e-06, "loss": 0.0071, "num_tokens": 29973800.0, "reward": 0.7454427480697632, "reward_std": 0.15918828547000885, "rewards/accuracy_reward": 0.5, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 30 }, { "epoch": 0.048, "eval_completions/clipped_ratio": 0.012920673076923073, "eval_completions/max_length": 4096.0, "eval_completions/max_terminated_length": 2479.125, "eval_completions/mean_length": 525.2048568725586, "eval_completions/mean_terminated_length": 478.407527923584, "eval_completions/min_length": 115.75, "eval_completions/min_terminated_length": 115.75, "eval_loss": 0.0, "eval_num_tokens": 29973800.0, "eval_reward": 0.767578125, "eval_reward_std": 0.2635391540825367, "eval_rewards/accuracy_reward": 0.5517578125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9833984375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 259.4187, "eval_samples_per_second": 3.855, "eval_steps_per_second": 0.031, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 590.5234375, "completions/mean_terminated_length": 525.4349975585938, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.0496, "grad_norm": 0.00036120781442150474, "learning_rate": 2.4603174603174605e-06, "loss": 0.0134, "num_tokens": 31134636.0, "reward": 0.7301432490348816, "reward_std": 0.17832188308238983, "rewards/accuracy_reward": 0.482421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9778645634651184, "rewards/mean_confidence_reward": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 500.84246826171875, "completions/mean_terminated_length": 477.2831115722656, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0512, "grad_norm": 0.0005704615614376962, "learning_rate": 2.53968253968254e-06, "loss": 0.0054, "num_tokens": 32146010.0, "reward": 0.75, "reward_std": 0.17993192374706268, "rewards/accuracy_reward": 0.5084635615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 546.740234375, "completions/mean_terminated_length": 509.379638671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.0528, "grad_norm": 0.00035105724236927927, "learning_rate": 2.6190476190476192e-06, "loss": 0.0099, "num_tokens": 33231051.0, "reward": 0.7897135615348816, "reward_std": 0.16787074506282806, "rewards/accuracy_reward": 0.595703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 516.3112182617188, "completions/mean_terminated_length": 478.6302795410156, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.0544, "grad_norm": 0.0003981503250543028, "learning_rate": 2.6984126984126986e-06, "loss": 0.0124, "num_tokens": 34293769.0, "reward": 0.806640625, "reward_std": 0.18746080994606018, "rewards/accuracy_reward": 0.6263020634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3958.0, "completions/mean_length": 548.501953125, "completions/mean_terminated_length": 532.2609252929688, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.056, "grad_norm": 0.0002942201681435108, "learning_rate": 2.7777777777777783e-06, "loss": 0.0046, "num_tokens": 35402156.0, "reward": 0.8541666865348816, "reward_std": 0.13000959157943726, "rewards/accuracy_reward": 0.7135416865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3906.0, "completions/mean_length": 531.2259521484375, "completions/mean_terminated_length": 517.2463989257812, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.0576, "grad_norm": 0.0002970800269395113, "learning_rate": 2.8571428571428573e-06, "loss": 0.0063, "num_tokens": 36470407.0, "reward": 0.8645833730697632, "reward_std": 0.1243864893913269, "rewards/accuracy_reward": 0.7350260615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 556.23828125, "completions/mean_terminated_length": 533.0419311523438, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.0592, "grad_norm": 0.00032798058236949146, "learning_rate": 2.936507936507937e-06, "loss": 0.0059, "num_tokens": 37592821.0, "reward": 0.7698568105697632, "reward_std": 0.16345205903053284, "rewards/accuracy_reward": 0.5475260615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 586.8626708984375, "completions/mean_terminated_length": 528.8027954101562, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0608, "grad_norm": 0.0003272242611274123, "learning_rate": 3.015873015873016e-06, "loss": 0.0147, "num_tokens": 38750178.0, "reward": 0.8277994990348816, "reward_std": 0.1604650914669037, "rewards/accuracy_reward": 0.677734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9778645634651184, "rewards/mean_confidence_reward": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 605.212890625, "completions/mean_terminated_length": 584.6384887695312, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.0624, "grad_norm": 0.00034811426303349435, "learning_rate": 3.0952380952380957e-06, "loss": 0.0088, "num_tokens": 39936457.0, "reward": 0.7454427480697632, "reward_std": 0.18961429595947266, "rewards/accuracy_reward": 0.5006510615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 658.4661865234375, "completions/mean_terminated_length": 601.5910034179688, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.064, "grad_norm": 0.00030578888254240155, "learning_rate": 3.1746031746031746e-06, "loss": 0.0178, "num_tokens": 41215029.0, "reward": 0.7880859375, "reward_std": 0.18464241921901703, "rewards/accuracy_reward": 0.59765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.978515625, "rewards/mean_confidence_reward": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 573.9915771484375, "completions/mean_terminated_length": 539.2577514648438, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.0656, "grad_norm": 0.0003540613397490233, "learning_rate": 3.2539682539682544e-06, "loss": 0.0105, "num_tokens": 42344008.0, "reward": 0.8391927480697632, "reward_std": 0.14466650784015656, "rewards/accuracy_reward": 0.6907551884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9876301884651184, "rewards/mean_confidence_reward": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3911.0, "completions/mean_length": 636.943359375, "completions/mean_terminated_length": 593.61962890625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0672, "grad_norm": 0.000313192285830155, "learning_rate": 3.3333333333333333e-06, "loss": 0.013, "num_tokens": 43592305.0, "reward": 0.8212890625, "reward_std": 0.16263310611248016, "rewards/accuracy_reward": 0.6569010615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3958.0, "completions/mean_length": 571.6666870117188, "completions/mean_terminated_length": 536.9099731445312, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0688, "grad_norm": 0.000282664637779817, "learning_rate": 3.412698412698413e-06, "loss": 0.0165, "num_tokens": 44716209.0, "reward": 0.8072916865348816, "reward_std": 0.14460304379463196, "rewards/accuracy_reward": 0.62890625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 651.8776245117188, "completions/mean_terminated_length": 599.521484375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0704, "grad_norm": 0.00025070359697565436, "learning_rate": 3.492063492063492e-06, "loss": 0.0093, "num_tokens": 45970421.0, "reward": 0.7444661855697632, "reward_std": 0.14321261644363403, "rewards/accuracy_reward": 0.5071614384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9817708134651184, "rewards/mean_confidence_reward": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3905.0, "completions/mean_length": 629.1569213867188, "completions/mean_terminated_length": 594.9671630859375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.072, "grad_norm": 0.00021849016775377095, "learning_rate": 3.5714285714285718e-06, "loss": 0.0081, "num_tokens": 47182214.0, "reward": 0.8157552480697632, "reward_std": 0.11393491923809052, "rewards/accuracy_reward": 0.64453125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 45 }, { "epoch": 0.072, "eval_completions/clipped_ratio": 0.009765625, "eval_completions/max_length": 3791.375, "eval_completions/max_terminated_length": 2677.5, "eval_completions/mean_length": 618.7805786132812, "eval_completions/mean_terminated_length": 584.5479583740234, "eval_completions/min_length": 172.625, "eval_completions/min_terminated_length": 172.625, "eval_loss": 0.0, "eval_num_tokens": 47182214.0, "eval_reward": 0.806640625, "eval_reward_std": 0.25718154944479465, "eval_rewards/accuracy_reward": 0.626953125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.986328125, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 242.5183, "eval_samples_per_second": 4.123, "eval_steps_per_second": 0.033, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 644.375, "completions/mean_terminated_length": 605.745849609375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.0736, "grad_norm": 0.0002685417130123824, "learning_rate": 3.6507936507936507e-06, "loss": 0.0099, "num_tokens": 48419078.0, "reward": 0.7688802480697632, "reward_std": 0.15912975370883942, "rewards/accuracy_reward": 0.5546875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9830729365348816, "rewards/mean_confidence_reward": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 719.2916870117188, "completions/mean_terminated_length": 679.2516479492188, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.0752, "grad_norm": 0.0002579140127636492, "learning_rate": 3.7301587301587305e-06, "loss": 0.0133, "num_tokens": 49780102.0, "reward": 0.7903646230697632, "reward_std": 0.14441931247711182, "rewards/accuracy_reward": 0.6028645634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9778645634651184, "rewards/mean_confidence_reward": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 528.9557495117188, "completions/mean_terminated_length": 510.2801208496094, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.0768, "grad_norm": 0.00030686004902236164, "learning_rate": 3.80952380952381e-06, "loss": 0.0036, "num_tokens": 50837762.0, "reward": 0.7994791865348816, "reward_std": 0.14366571605205536, "rewards/accuracy_reward": 0.6048176884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02018229166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 642.533203125, "completions/mean_terminated_length": 571.3986206054688, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.0784, "grad_norm": 0.00026491357129998505, "learning_rate": 3.88888888888889e-06, "loss": 0.0112, "num_tokens": 52073717.0, "reward": 0.8528646230697632, "reward_std": 0.13388270139694214, "rewards/accuracy_reward": 0.7278645634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9778645634651184, "rewards/mean_confidence_reward": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 574.0573120117188, "completions/mean_terminated_length": 536.9842529296875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.08, "grad_norm": 0.0002825928677339107, "learning_rate": 3.968253968253968e-06, "loss": 0.0131, "num_tokens": 53207437.0, "reward": 0.8336588740348816, "reward_std": 0.11251487582921982, "rewards/accuracy_reward": 0.6809895634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 604.4407958984375, "completions/mean_terminated_length": 572.3239135742188, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.0816, "grad_norm": 0.0003044702170882374, "learning_rate": 4.047619047619048e-06, "loss": 0.0078, "num_tokens": 54374098.0, "reward": 0.8209635615348816, "reward_std": 0.15141773223876953, "rewards/accuracy_reward": 0.65625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4032.0, "completions/mean_length": 654.951171875, "completions/mean_terminated_length": 623.2989501953125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0832, "grad_norm": 0.00044901532237417996, "learning_rate": 4.126984126984127e-06, "loss": 0.0117, "num_tokens": 55642983.0, "reward": 0.8193359375, "reward_std": 0.15567922592163086, "rewards/accuracy_reward": 0.6516926884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 595.3197021484375, "completions/mean_terminated_length": 581.5914916992188, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.0848, "grad_norm": 0.000326639914419502, "learning_rate": 4.206349206349207e-06, "loss": 0.003, "num_tokens": 56806258.0, "reward": 0.845703125, "reward_std": 0.12496884912252426, "rewards/accuracy_reward": 0.6985676884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 624.6712646484375, "completions/mean_terminated_length": 585.8215942382812, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.0864, "grad_norm": 0.00040903998888097703, "learning_rate": 4.2857142857142855e-06, "loss": 0.0153, "num_tokens": 58014169.0, "reward": 0.8479818105697632, "reward_std": 0.14804205298423767, "rewards/accuracy_reward": 0.7122395634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 598.5612182617188, "completions/mean_terminated_length": 566.3902587890625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.088, "grad_norm": 0.0004573469050228596, "learning_rate": 4.365079365079366e-06, "loss": 0.0064, "num_tokens": 59191095.0, "reward": 0.8141276240348816, "reward_std": 0.1163366287946701, "rewards/accuracy_reward": 0.640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9876301884651184, "rewards/mean_confidence_reward": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3852.0, "completions/mean_length": 605.8216552734375, "completions/mean_terminated_length": 598.9915161132812, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.0896, "grad_norm": 0.0004898729384876788, "learning_rate": 4.444444444444444e-06, "loss": 0.0016, "num_tokens": 60377189.0, "reward": 0.7962239980697632, "reward_std": 0.13314379751682281, "rewards/accuracy_reward": 0.5950520634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 681.6588745117188, "completions/mean_terminated_length": 632.044921875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.0912, "grad_norm": 0.0004666224995162338, "learning_rate": 4.523809523809524e-06, "loss": 0.0122, "num_tokens": 61685337.0, "reward": 0.8118489980697632, "reward_std": 0.12811368703842163, "rewards/accuracy_reward": 0.6438801884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9798176884651184, "rewards/mean_confidence_reward": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 593.015625, "completions/mean_terminated_length": 556.1421508789062, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0928, "grad_norm": 0.0003643535019364208, "learning_rate": 4.603174603174604e-06, "loss": 0.0102, "num_tokens": 62848561.0, "reward": 0.8421224355697632, "reward_std": 0.13019959628582, "rewards/accuracy_reward": 0.6979166865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 631.9244995117188, "completions/mean_terminated_length": 627.4080810546875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0944, "grad_norm": 0.0003547768574208021, "learning_rate": 4.682539682539683e-06, "loss": 0.0048, "num_tokens": 64096829.0, "reward": 0.8277994990348816, "reward_std": 0.12740173935890198, "rewards/accuracy_reward": 0.658203125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3979.0, "completions/mean_length": 631.4401245117188, "completions/mean_terminated_length": 604.1600952148438, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.096, "grad_norm": 0.00024742772802710533, "learning_rate": 4.761904761904762e-06, "loss": 0.0069, "num_tokens": 65315457.0, "reward": 0.8186849355697632, "reward_std": 0.12658625841140747, "rewards/accuracy_reward": 0.6471354365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 60 }, { "epoch": 0.096, "eval_completions/clipped_ratio": 0.0029296875, "eval_completions/max_length": 2715.25, "eval_completions/max_terminated_length": 2122.75, "eval_completions/mean_length": 608.9176712036133, "eval_completions/mean_terminated_length": 598.6858749389648, "eval_completions/min_length": 168.125, "eval_completions/min_terminated_length": 168.125, "eval_loss": 0.0, "eval_num_tokens": 65315457.0, "eval_reward": 0.8193359375, "eval_reward_std": 0.2435612753033638, "eval_rewards/accuracy_reward": 0.642578125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.99609375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 176.7374, "eval_samples_per_second": 5.658, "eval_steps_per_second": 0.045, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3401.0, "completions/mean_length": 593.6810302734375, "completions/mean_terminated_length": 582.2429809570312, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.0976, "grad_norm": 0.00028685847064480186, "learning_rate": 4.841269841269842e-06, "loss": 0.0014, "num_tokens": 66491095.0, "reward": 0.8385416865348816, "reward_std": 0.1227656826376915, "rewards/accuracy_reward": 0.6809895634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 625.0026245117188, "completions/mean_terminated_length": 602.2568969726562, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0992, "grad_norm": 0.0002773251908365637, "learning_rate": 4.920634920634921e-06, "loss": 0.0054, "num_tokens": 67706331.0, "reward": 0.8502604365348816, "reward_std": 0.11784438788890839, "rewards/accuracy_reward": 0.70703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 626.279296875, "completions/mean_terminated_length": 612.6725463867188, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1008, "grad_norm": 0.0002715775335673243, "learning_rate": 5e-06, "loss": 0.0035, "num_tokens": 68931816.0, "reward": 0.828125, "reward_std": 0.12496759742498398, "rewards/accuracy_reward": 0.6686198115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9876301884651184, "rewards/mean_confidence_reward": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 589.044921875, "completions/mean_terminated_length": 568.375244140625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1024, "grad_norm": 0.00031587263219989836, "learning_rate": 4.980000000000001e-06, "loss": 0.0077, "num_tokens": 70093709.0, "reward": 0.8782552480697632, "reward_std": 0.10366225242614746, "rewards/accuracy_reward": 0.7669270634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 596.2513427734375, "completions/mean_terminated_length": 568.6942138671875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.104, "grad_norm": 0.0003634801250882447, "learning_rate": 4.960000000000001e-06, "loss": 0.0092, "num_tokens": 71258191.0, "reward": 0.8326823115348816, "reward_std": 0.13718102872371674, "rewards/accuracy_reward": 0.67578125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4021.0, "completions/mean_length": 697.556640625, "completions/mean_terminated_length": 675.286376953125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1056, "grad_norm": 0.0004307759809307754, "learning_rate": 4.94e-06, "loss": 0.0095, "num_tokens": 72587366.0, "reward": 0.7705078125, "reward_std": 0.15608450770378113, "rewards/accuracy_reward": 0.5540364384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3306.0, "completions/mean_length": 640.451171875, "completions/mean_terminated_length": 631.4288330078125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1072, "grad_norm": 0.00039831711910665035, "learning_rate": 4.92e-06, "loss": 0.0013, "num_tokens": 73833819.0, "reward": 0.783203125, "reward_std": 0.10659673810005188, "rewards/accuracy_reward": 0.5696614384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3449.0, "completions/mean_length": 582.7708740234375, "completions/mean_terminated_length": 571.2971801757812, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1088, "grad_norm": 0.0003034260298591107, "learning_rate": 4.9000000000000005e-06, "loss": 0.0046, "num_tokens": 74988763.0, "reward": 0.8365885615348816, "reward_std": 0.12030304968357086, "rewards/accuracy_reward": 0.677734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3981.0, "completions/mean_length": 680.7806396484375, "completions/mean_terminated_length": 671.8635864257812, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1104, "grad_norm": 0.0002551264187786728, "learning_rate": 4.880000000000001e-06, "loss": 0.007, "num_tokens": 76286794.0, "reward": 0.7815755605697632, "reward_std": 0.10738813877105713, "rewards/accuracy_reward": 0.56640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 669.4954833984375, "completions/mean_terminated_length": 665.0280151367188, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.112, "grad_norm": 0.0002691724512260407, "learning_rate": 4.86e-06, "loss": 0.0067, "num_tokens": 77568899.0, "reward": 0.8180338740348816, "reward_std": 0.13050046563148499, "rewards/accuracy_reward": 0.6393229365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 590.6732177734375, "completions/mean_terminated_length": 576.9268188476562, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1136, "grad_norm": 0.00026899558724835515, "learning_rate": 4.84e-06, "loss": 0.0023, "num_tokens": 78723629.0, "reward": 0.8330078125, "reward_std": 0.11817143857479095, "rewards/accuracy_reward": 0.6705729365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3896.0, "completions/mean_length": 620.8861083984375, "completions/mean_terminated_length": 607.2581787109375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1152, "grad_norm": 0.00025884059141390026, "learning_rate": 4.8200000000000004e-06, "loss": 0.0073, "num_tokens": 79917726.0, "reward": 0.86328125, "reward_std": 0.1270594298839569, "rewards/accuracy_reward": 0.7317708134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3943.0, "completions/mean_length": 607.3314208984375, "completions/mean_terminated_length": 586.7694702148438, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.1168, "grad_norm": 0.0003034123801626265, "learning_rate": 4.800000000000001e-06, "loss": 0.0038, "num_tokens": 81094619.0, "reward": 0.7819010615348816, "reward_std": 0.12809543311595917, "rewards/accuracy_reward": 0.5716145634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 638.1829833984375, "completions/mean_terminated_length": 608.6677856445312, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1184, "grad_norm": 0.0002892453921958804, "learning_rate": 4.78e-06, "loss": 0.0109, "num_tokens": 82336692.0, "reward": 0.8297526240348816, "reward_std": 0.1311059594154358, "rewards/accuracy_reward": 0.6744791865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9850260615348816, "rewards/mean_confidence_reward": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01692708333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3774.0, "completions/mean_length": 675.048828125, "completions/mean_terminated_length": 616.14501953125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.12, "grad_norm": 0.0003814563970081508, "learning_rate": 4.76e-06, "loss": 0.0133, "num_tokens": 83641919.0, "reward": 0.7985026240348816, "reward_std": 0.13955065608024597, "rewards/accuracy_reward": 0.615234375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9817708134651184, "rewards/mean_confidence_reward": 0.0, "step": 75 }, { "epoch": 0.12, "eval_completions/clipped_ratio": 0.009014423076923073, "eval_completions/max_length": 3676.875, "eval_completions/max_terminated_length": 2012.75, "eval_completions/mean_length": 618.9544067382812, "eval_completions/mean_terminated_length": 587.3268280029297, "eval_completions/min_length": 145.875, "eval_completions/min_terminated_length": 145.875, "eval_loss": 0.0, "eval_num_tokens": 83641919.0, "eval_reward": 0.82177734375, "eval_reward_std": 0.2460994180291891, "eval_rewards/accuracy_reward": 0.65234375, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9912109375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 236.0921, "eval_samples_per_second": 4.236, "eval_steps_per_second": 0.034, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 603.69921875, "completions/mean_terminated_length": 573.8897094726562, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1216, "grad_norm": 0.00031780439894646406, "learning_rate": 4.74e-06, "loss": 0.0061, "num_tokens": 84811569.0, "reward": 0.8046875, "reward_std": 0.0965360552072525, "rewards/accuracy_reward": 0.6178385615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 594.330078125, "completions/mean_terminated_length": 571.3833618164062, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1232, "grad_norm": 0.0003951648832298815, "learning_rate": 4.7200000000000005e-06, "loss": 0.0068, "num_tokens": 85975884.0, "reward": 0.8447265625, "reward_std": 0.13739246129989624, "rewards/accuracy_reward": 0.6959635615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 627.9453125, "completions/mean_terminated_length": 621.1585083007812, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1248, "grad_norm": 0.0004775661800522357, "learning_rate": 4.7e-06, "loss": 0.004, "num_tokens": 87192664.0, "reward": 0.7939453125, "reward_std": 0.11887729167938232, "rewards/accuracy_reward": 0.58984375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 661.8001708984375, "completions/mean_terminated_length": 625.6506958007812, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1264, "grad_norm": 0.00026928301667794585, "learning_rate": 4.680000000000001e-06, "loss": 0.0067, "num_tokens": 88452517.0, "reward": 0.8294271230697632, "reward_std": 0.11433301866054535, "rewards/accuracy_reward": 0.669921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 616.0162963867188, "completions/mean_terminated_length": 593.211669921875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.128, "grad_norm": 0.0002962102589663118, "learning_rate": 4.66e-06, "loss": 0.0063, "num_tokens": 89656574.0, "reward": 0.822265625, "reward_std": 0.11920362710952759, "rewards/accuracy_reward": 0.6516926884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3777.0, "completions/mean_length": 617.0443115234375, "completions/mean_terminated_length": 605.6825561523438, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1296, "grad_norm": 0.00046732689952477813, "learning_rate": 4.6400000000000005e-06, "loss": 0.0087, "num_tokens": 90856130.0, "reward": 0.8675130605697632, "reward_std": 0.13969169557094574, "rewards/accuracy_reward": 0.73828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 704.154296875, "completions/mean_terminated_length": 686.3959350585938, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.1312, "grad_norm": 0.0002892545599024743, "learning_rate": 4.620000000000001e-06, "loss": 0.0067, "num_tokens": 92194831.0, "reward": 0.80078125, "reward_std": 0.11451419442892075, "rewards/accuracy_reward": 0.6106770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3580.0, "completions/mean_length": 664.8502807617188, "completions/mean_terminated_length": 614.9920654296875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1328, "grad_norm": 0.0003815610834863037, "learning_rate": 4.600000000000001e-06, "loss": 0.0066, "num_tokens": 93472617.0, "reward": 0.8450521230697632, "reward_std": 0.13120511174201965, "rewards/accuracy_reward": 0.7057291865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3598.0, "completions/mean_length": 645.689453125, "completions/mean_terminated_length": 627.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1344, "grad_norm": 0.0003338145324960351, "learning_rate": 4.58e-06, "loss": 0.004, "num_tokens": 94714316.0, "reward": 0.787109375, "reward_std": 0.12098296731710434, "rewards/accuracy_reward": 0.580078125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 651.544921875, "completions/mean_terminated_length": 624.4232177734375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.136, "grad_norm": 0.0003854295064229518, "learning_rate": 4.56e-06, "loss": 0.0069, "num_tokens": 95962065.0, "reward": 0.7819010615348816, "reward_std": 0.14957842230796814, "rewards/accuracy_reward": 0.5755208134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 566.974609375, "completions/mean_terminated_length": 562.37353515625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1376, "grad_norm": 0.000296604644972831, "learning_rate": 4.540000000000001e-06, "loss": 0.0027, "num_tokens": 97074986.0, "reward": 0.8453776240348816, "reward_std": 0.12123578786849976, "rewards/accuracy_reward": 0.6920573115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 557.8131713867188, "completions/mean_terminated_length": 550.8890991210938, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1392, "grad_norm": 0.0002939091937150806, "learning_rate": 4.520000000000001e-06, "loss": 0.0041, "num_tokens": 98187755.0, "reward": 0.828125, "reward_std": 0.0909687727689743, "rewards/accuracy_reward": 0.6595051884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3967.0, "completions/mean_length": 616.4095458984375, "completions/mean_terminated_length": 582.0940551757812, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1408, "grad_norm": 0.0003328786406200379, "learning_rate": 4.5e-06, "loss": 0.0097, "num_tokens": 99385088.0, "reward": 0.7998046875, "reward_std": 0.11913998425006866, "rewards/accuracy_reward": 0.6106770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3993.0, "completions/mean_length": 728.5859375, "completions/mean_terminated_length": 706.51904296875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1424, "grad_norm": 0.000546226860024035, "learning_rate": 4.48e-06, "loss": 0.0089, "num_tokens": 100765860.0, "reward": 0.7581380605697632, "reward_std": 0.15459588170051575, "rewards/accuracy_reward": 0.5260416865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 629.5358276367188, "completions/mean_terminated_length": 620.4849853515625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.144, "grad_norm": 0.00028285328880883753, "learning_rate": 4.4600000000000005e-06, "loss": 0.0029, "num_tokens": 101994011.0, "reward": 0.8365885615348816, "reward_std": 0.11054506152868271, "rewards/accuracy_reward": 0.67578125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 90 }, { "epoch": 0.144, "eval_completions/clipped_ratio": 0.005108173076923073, "eval_completions/max_length": 3371.5, "eval_completions/max_terminated_length": 2263.625, "eval_completions/mean_length": 596.7481231689453, "eval_completions/mean_terminated_length": 578.7095336914062, "eval_completions/min_length": 166.0, "eval_completions/min_terminated_length": 166.0, "eval_loss": 0.0, "eval_num_tokens": 101994011.0, "eval_reward": 0.8232421875, "eval_reward_std": 0.24475175328552723, "eval_rewards/accuracy_reward": 0.65234375, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.994140625, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 216.3937, "eval_samples_per_second": 4.621, "eval_steps_per_second": 0.037, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3382.0, "completions/mean_length": 660.775390625, "completions/mean_terminated_length": 629.1766967773438, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.1456, "grad_norm": 0.00032206380274146795, "learning_rate": 4.440000000000001e-06, "loss": 0.0145, "num_tokens": 103279746.0, "reward": 0.7848307490348816, "reward_std": 0.13898824155330658, "rewards/accuracy_reward": 0.5787760615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4020.0, "completions/mean_length": 566.9381713867188, "completions/mean_terminated_length": 553.0986938476562, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.1472, "grad_norm": 0.0003291277389507741, "learning_rate": 4.42e-06, "loss": 0.0063, "num_tokens": 104397155.0, "reward": 0.8583984375, "reward_std": 0.12042063474655151, "rewards/accuracy_reward": 0.72265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 556.736328125, "completions/mean_terminated_length": 549.8101806640625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1488, "grad_norm": 0.0003056551795452833, "learning_rate": 4.4e-06, "loss": 0.0042, "num_tokens": 105498382.0, "reward": 0.8512369990348816, "reward_std": 0.1074962317943573, "rewards/accuracy_reward": 0.7044270634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3868.0, "completions/mean_length": 574.22265625, "completions/mean_terminated_length": 569.6310424804688, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.1504, "grad_norm": 0.0002520487760193646, "learning_rate": 4.38e-06, "loss": 0.0025, "num_tokens": 106633284.0, "reward": 0.8505859375, "reward_std": 0.08086061477661133, "rewards/accuracy_reward": 0.703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 661.4479370117188, "completions/mean_terminated_length": 636.674072265625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.152, "grad_norm": 0.0003039802541024983, "learning_rate": 4.360000000000001e-06, "loss": 0.0058, "num_tokens": 107895764.0, "reward": 0.7962239980697632, "reward_std": 0.10681943595409393, "rewards/accuracy_reward": 0.6009114384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3388.0, "completions/mean_length": 615.0462646484375, "completions/mean_terminated_length": 608.2341918945312, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.1536, "grad_norm": 0.00027706267428584397, "learning_rate": 4.34e-06, "loss": 0.0027, "num_tokens": 109098619.0, "reward": 0.8626302480697632, "reward_std": 0.11808930337429047, "rewards/accuracy_reward": 0.7278645634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 603.88671875, "completions/mean_terminated_length": 587.8992309570312, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.1552, "grad_norm": 0.00034783451701514423, "learning_rate": 4.32e-06, "loss": 0.007, "num_tokens": 110267309.0, "reward": 0.8238932490348816, "reward_std": 0.13173702359199524, "rewards/accuracy_reward": 0.6536458134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 607.7220458984375, "completions/mean_terminated_length": 600.8956298828125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.1568, "grad_norm": 0.00031023091287352145, "learning_rate": 4.3e-06, "loss": 0.0006, "num_tokens": 111452194.0, "reward": 0.7848307490348816, "reward_std": 0.11247827112674713, "rewards/accuracy_reward": 0.572265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3789.0, "completions/mean_length": 572.8756713867188, "completions/mean_terminated_length": 565.9810791015625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.1584, "grad_norm": 0.0003076742577832192, "learning_rate": 4.2800000000000005e-06, "loss": 0.0032, "num_tokens": 112584515.0, "reward": 0.8675130605697632, "reward_std": 0.10639947652816772, "rewards/accuracy_reward": 0.7376301884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 589.2448120117188, "completions/mean_terminated_length": 575.4927978515625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.16, "grad_norm": 0.00034919948666356504, "learning_rate": 4.26e-06, "loss": 0.0036, "num_tokens": 113733467.0, "reward": 0.8212890625, "reward_std": 0.09939996898174286, "rewards/accuracy_reward": 0.6477864384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3948.0, "completions/mean_length": 648.6927490234375, "completions/mean_terminated_length": 637.434326171875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1616, "grad_norm": 0.0003104656934738159, "learning_rate": 4.24e-06, "loss": 0.0038, "num_tokens": 114987843.0, "reward": 0.8053385615348816, "reward_std": 0.11546418070793152, "rewards/accuracy_reward": 0.6145833134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 685.04296875, "completions/mean_terminated_length": 660.4393310546875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.1632, "grad_norm": 0.0004274912644177675, "learning_rate": 4.22e-06, "loss": 0.0112, "num_tokens": 116299173.0, "reward": 0.8138021230697632, "reward_std": 0.13215380907058716, "rewards/accuracy_reward": 0.63671875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3681.0, "completions/mean_length": 596.3600463867188, "completions/mean_terminated_length": 573.4266357421875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1648, "grad_norm": 0.0003667453129310161, "learning_rate": 4.2000000000000004e-06, "loss": 0.0055, "num_tokens": 117457806.0, "reward": 0.7955729365348816, "reward_std": 0.1227598786354065, "rewards/accuracy_reward": 0.5983073115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 629.802734375, "completions/mean_terminated_length": 625.2835693359375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.1664, "grad_norm": 0.0003102488408330828, "learning_rate": 4.18e-06, "loss": 0.0006, "num_tokens": 118686367.0, "reward": 0.7991536855697632, "reward_std": 0.12297843396663666, "rewards/accuracy_reward": 0.6002604365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3969.0, "completions/mean_length": 624.1693115234375, "completions/mean_terminated_length": 610.5542602539062, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.168, "grad_norm": 0.0003095600113738328, "learning_rate": 4.16e-06, "loss": 0.0049, "num_tokens": 119890435.0, "reward": 0.8258463740348816, "reward_std": 0.1064920648932457, "rewards/accuracy_reward": 0.658203125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 105 }, { "epoch": 0.168, "eval_completions/clipped_ratio": 0.001953125, "eval_completions/max_length": 2841.625, "eval_completions/max_terminated_length": 2550.0, "eval_completions/mean_length": 634.35546875, "eval_completions/mean_terminated_length": 627.5686187744141, "eval_completions/min_length": 177.25, "eval_completions/min_terminated_length": 177.25, "eval_loss": 0.0, "eval_num_tokens": 119890435.0, "eval_reward": 0.8271484375, "eval_reward_std": 0.24152498692274094, "eval_rewards/accuracy_reward": 0.658203125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.99609375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 185.3327, "eval_samples_per_second": 5.396, "eval_steps_per_second": 0.043, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3888.0, "completions/mean_length": 575.0169677734375, "completions/mean_terminated_length": 561.2091674804688, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.1696, "grad_norm": 0.00042155911796726286, "learning_rate": 4.14e-06, "loss": 0.006, "num_tokens": 121021469.0, "reward": 0.8636068105697632, "reward_std": 0.12580615282058716, "rewards/accuracy_reward": 0.7317708134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3745.0, "completions/mean_length": 621.05078125, "completions/mean_terminated_length": 611.9778442382812, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.1712, "grad_norm": 0.0003103128692600876, "learning_rate": 4.12e-06, "loss": 0.0062, "num_tokens": 122223403.0, "reward": 0.8522135615348816, "reward_std": 0.11826847493648529, "rewards/accuracy_reward": 0.70703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4009.0, "completions/mean_length": 562.3861083984375, "completions/mean_terminated_length": 550.8458251953125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1728, "grad_norm": 0.00036560013541020453, "learning_rate": 4.1e-06, "loss": 0.0069, "num_tokens": 123328988.0, "reward": 0.8645833730697632, "reward_std": 0.08850304782390594, "rewards/accuracy_reward": 0.7337239384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 651.5208740234375, "completions/mean_terminated_length": 644.7802124023438, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1744, "grad_norm": 0.00031642912654206157, "learning_rate": 4.08e-06, "loss": 0.0056, "num_tokens": 124583068.0, "reward": 0.8186849355697632, "reward_std": 0.1302374005317688, "rewards/accuracy_reward": 0.6393229365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4010.0, "completions/mean_length": 679.2682495117188, "completions/mean_terminated_length": 670.3472900390625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.176, "grad_norm": 0.0002773235028143972, "learning_rate": 4.060000000000001e-06, "loss": 0.0062, "num_tokens": 125877400.0, "reward": 0.8177083730697632, "reward_std": 0.11473320424556732, "rewards/accuracy_reward": 0.6393229365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3899.0, "completions/max_terminated_length": 3899.0, "completions/mean_length": 585.9577026367188, "completions/mean_terminated_length": 585.9577026367188, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1776, "grad_norm": 0.0002980628050863743, "learning_rate": 4.04e-06, "loss": 0.0024, "num_tokens": 127029463.0, "reward": 0.8922526240348816, "reward_std": 0.09214043617248535, "rewards/accuracy_reward": 0.7858073115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3726.0, "completions/mean_length": 679.080078125, "completions/mean_terminated_length": 672.3933715820312, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.1792, "grad_norm": 0.00028468415257520974, "learning_rate": 4.0200000000000005e-06, "loss": 0.0055, "num_tokens": 128324498.0, "reward": 0.8658854365348816, "reward_std": 0.117245614528656, "rewards/accuracy_reward": 0.7350260615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3480.0, "completions/mean_length": 639.328125, "completions/mean_terminated_length": 634.8213500976562, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.1808, "grad_norm": 0.00022866151994094253, "learning_rate": 4.000000000000001e-06, "loss": 0.0036, "num_tokens": 129554410.0, "reward": 0.8219401240348816, "reward_std": 0.08891090005636215, "rewards/accuracy_reward": 0.6451823115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 641.0358276367188, "completions/mean_terminated_length": 638.7850341796875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.1824, "grad_norm": 0.0002721015189308673, "learning_rate": 3.980000000000001e-06, "loss": 0.0012, "num_tokens": 130790209.0, "reward": 0.7955729365348816, "reward_std": 0.0999690517783165, "rewards/accuracy_reward": 0.5924479365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 656.458984375, "completions/mean_terminated_length": 647.4784545898438, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.184, "grad_norm": 0.0002770905557554215, "learning_rate": 3.96e-06, "loss": 0.0017, "num_tokens": 132034018.0, "reward": 0.8606771230697632, "reward_std": 0.10591164231300354, "rewards/accuracy_reward": 0.7252604365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3730.0, "completions/mean_length": 757.0787963867188, "completions/mean_terminated_length": 750.544677734375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1856, "grad_norm": 0.00027138355653733015, "learning_rate": 3.94e-06, "loss": 0.0039, "num_tokens": 133442523.0, "reward": 0.8385416865348816, "reward_std": 0.11794879287481308, "rewards/accuracy_reward": 0.6796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 721.6412963867188, "completions/mean_terminated_length": 686.1217651367188, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.1872, "grad_norm": 0.00031462108017876744, "learning_rate": 3.920000000000001e-06, "loss": 0.0075, "num_tokens": 134799412.0, "reward": 0.732421875, "reward_std": 0.11114764213562012, "rewards/accuracy_reward": 0.4791666567325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 645.5436401367188, "completions/mean_terminated_length": 641.0449829101562, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1888, "grad_norm": 0.0002814891922753304, "learning_rate": 3.900000000000001e-06, "loss": 0.0045, "num_tokens": 136035543.0, "reward": 0.8470052480697632, "reward_std": 0.11385184526443481, "rewards/accuracy_reward": 0.6953125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 727.3991088867188, "completions/mean_terminated_length": 707.5448608398438, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1904, "grad_norm": 0.000273804587777704, "learning_rate": 3.88e-06, "loss": 0.0067, "num_tokens": 137413308.0, "reward": 0.8216146230697632, "reward_std": 0.12531858682632446, "rewards/accuracy_reward": 0.65234375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3779.0, "completions/mean_length": 700.2213745117188, "completions/mean_terminated_length": 693.5759887695312, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.192, "grad_norm": 0.00031936613959260285, "learning_rate": 3.86e-06, "loss": 0.0051, "num_tokens": 138731280.0, "reward": 0.8479818105697632, "reward_std": 0.10980620235204697, "rewards/accuracy_reward": 0.69921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 120 }, { "epoch": 0.192, "eval_completions/clipped_ratio": 0.00390625, "eval_completions/max_length": 3388.125, "eval_completions/max_terminated_length": 2737.75, "eval_completions/mean_length": 671.1189193725586, "eval_completions/mean_terminated_length": 657.5812530517578, "eval_completions/min_length": 183.0, "eval_completions/min_terminated_length": 183.0, "eval_loss": 0.0, "eval_num_tokens": 138731280.0, "eval_reward": 0.83837890625, "eval_reward_std": 0.24100100621581078, "eval_rewards/accuracy_reward": 0.6845703125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9921875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 219.7323, "eval_samples_per_second": 4.551, "eval_steps_per_second": 0.036, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3137.0, "completions/mean_length": 641.6243896484375, "completions/mean_terminated_length": 639.3739013671875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.1936, "grad_norm": 0.00032513559563085437, "learning_rate": 3.8400000000000005e-06, "loss": 0.0048, "num_tokens": 139975279.0, "reward": 0.8479818105697632, "reward_std": 0.10212653875350952, "rewards/accuracy_reward": 0.697265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 640.3561401367188, "completions/mean_terminated_length": 635.8507080078125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1952, "grad_norm": 0.0004863200301770121, "learning_rate": 3.820000000000001e-06, "loss": 0.0036, "num_tokens": 141214962.0, "reward": 0.833984375, "reward_std": 0.12785589694976807, "rewards/accuracy_reward": 0.6692708134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3845.0, "completions/mean_length": 671.798828125, "completions/mean_terminated_length": 660.6159057617188, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.1968, "grad_norm": 0.00029012025333940983, "learning_rate": 3.8000000000000005e-06, "loss": 0.0051, "num_tokens": 142505949.0, "reward": 0.8785807490348816, "reward_std": 0.09667699038982391, "rewards/accuracy_reward": 0.7610676884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 681.5228271484375, "completions/mean_terminated_length": 674.8408203125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1984, "grad_norm": 0.0003860195283778012, "learning_rate": 3.7800000000000002e-06, "loss": 0.0054, "num_tokens": 143800768.0, "reward": 0.8502604365348816, "reward_std": 0.11828397959470749, "rewards/accuracy_reward": 0.7063801884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 765.3346557617188, "completions/mean_terminated_length": 743.508544921875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.2, "grad_norm": 0.0003189561830367893, "learning_rate": 3.7600000000000004e-06, "loss": 0.0068, "num_tokens": 145211874.0, "reward": 0.8219401240348816, "reward_std": 0.14017345011234283, "rewards/accuracy_reward": 0.6536458134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4037.0, "completions/mean_length": 658.7806396484375, "completions/mean_terminated_length": 656.5413818359375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.2016, "grad_norm": 0.00029708523652516305, "learning_rate": 3.74e-06, "loss": 0.0022, "num_tokens": 146479889.0, "reward": 0.857421875, "reward_std": 0.09298097342252731, "rewards/accuracy_reward": 0.716796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 739.6614990234375, "completions/mean_terminated_length": 728.7001953125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2032, "grad_norm": 0.0002875027130357921, "learning_rate": 3.7200000000000004e-06, "loss": 0.0049, "num_tokens": 147870441.0, "reward": 0.8095703125, "reward_std": 0.11713831126689911, "rewards/accuracy_reward": 0.623046875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3633.0, "completions/mean_length": 651.76171875, "completions/mean_terminated_length": 635.993408203125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.2048, "grad_norm": 0.00027926714392378926, "learning_rate": 3.7e-06, "loss": 0.005, "num_tokens": 149114267.0, "reward": 0.8489583730697632, "reward_std": 0.11165182292461395, "rewards/accuracy_reward": 0.703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3599.0, "completions/mean_length": 643.2767333984375, "completions/mean_terminated_length": 625.1995849609375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2064, "grad_norm": 0.00027033602236770093, "learning_rate": 3.6800000000000003e-06, "loss": 0.0072, "num_tokens": 150348804.0, "reward": 0.828125, "reward_std": 0.11394263803958893, "rewards/accuracy_reward": 0.662109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4028.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 703.6959838867188, "completions/mean_terminated_length": 703.6959838867188, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.208, "grad_norm": 0.0002899360260926187, "learning_rate": 3.66e-06, "loss": 0.0021, "num_tokens": 151691921.0, "reward": 0.8343099355697632, "reward_std": 0.11762022972106934, "rewards/accuracy_reward": 0.6692708134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 593.505859375, "completions/mean_terminated_length": 588.9393920898438, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2096, "grad_norm": 0.0004012259014416486, "learning_rate": 3.6400000000000003e-06, "loss": 0.0022, "num_tokens": 152841786.0, "reward": 0.8232421875, "reward_std": 0.09440949559211731, "rewards/accuracy_reward": 0.6477864384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3897.0, "completions/mean_length": 648.228515625, "completions/mean_terminated_length": 639.2265014648438, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.2112, "grad_norm": 0.0002690345863811672, "learning_rate": 3.62e-06, "loss": 0.0043, "num_tokens": 154089305.0, "reward": 0.8466796875, "reward_std": 0.10462278127670288, "rewards/accuracy_reward": 0.6966145634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 688.78515625, "completions/mean_terminated_length": 679.8890380859375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2128, "grad_norm": 0.00028712491621263325, "learning_rate": 3.6000000000000003e-06, "loss": 0.0066, "num_tokens": 155403663.0, "reward": 0.8470052480697632, "reward_std": 0.11128474771976471, "rewards/accuracy_reward": 0.6966145634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3702.0, "completions/mean_length": 656.0651245117188, "completions/mean_terminated_length": 653.8240966796875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2144, "grad_norm": 0.00025992398150265217, "learning_rate": 3.58e-06, "loss": 0.0022, "num_tokens": 156661139.0, "reward": 0.8492838740348816, "reward_std": 0.11946448683738708, "rewards/accuracy_reward": 0.7005208134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 690.2291870117188, "completions/mean_terminated_length": 679.1064453125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.216, "grad_norm": 0.00029352240380831063, "learning_rate": 3.5600000000000002e-06, "loss": 0.0044, "num_tokens": 157965619.0, "reward": 0.8310546875, "reward_std": 0.11421811580657959, "rewards/accuracy_reward": 0.666015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 135 }, { "epoch": 0.216, "eval_completions/clipped_ratio": 0.004131610576923073, "eval_completions/max_length": 2533.625, "eval_completions/max_terminated_length": 2055.75, "eval_completions/mean_length": 674.3566741943359, "eval_completions/mean_terminated_length": 660.2391738891602, "eval_completions/min_length": 185.125, "eval_completions/min_terminated_length": 185.125, "eval_loss": 0.0, "eval_num_tokens": 157965619.0, "eval_reward": 0.83544921875, "eval_reward_std": 0.23921538889408112, "eval_rewards/accuracy_reward": 0.6748046875, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.99609375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 165.3443, "eval_samples_per_second": 6.048, "eval_steps_per_second": 0.048, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3362.0, "completions/mean_length": 692.8607177734375, "completions/mean_terminated_length": 688.4237060546875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.2176, "grad_norm": 0.00022620441450271755, "learning_rate": 3.54e-06, "loss": 0.0033, "num_tokens": 159279485.0, "reward": 0.8323568105697632, "reward_std": 0.10510279983282089, "rewards/accuracy_reward": 0.666015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 682.3587646484375, "completions/mean_terminated_length": 682.3587646484375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.2192, "grad_norm": 0.0002540338318794966, "learning_rate": 3.52e-06, "loss": 0.0013, "num_tokens": 160586436.0, "reward": 0.8365885615348816, "reward_std": 0.0832124650478363, "rewards/accuracy_reward": 0.6731770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 646.9765625, "completions/mean_terminated_length": 644.7296142578125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.2208, "grad_norm": 0.00026369214174337685, "learning_rate": 3.5e-06, "loss": 0.0026, "num_tokens": 161825440.0, "reward": 0.8030599355697632, "reward_std": 0.09876357764005661, "rewards/accuracy_reward": 0.6087239384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 630.865234375, "completions/mean_terminated_length": 624.0841674804688, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2224, "grad_norm": 0.0002618631988298148, "learning_rate": 3.48e-06, "loss": 0.0039, "num_tokens": 163042641.0, "reward": 0.8447265625, "reward_std": 0.12326391041278839, "rewards/accuracy_reward": 0.6940104365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 667.21875, "completions/mean_terminated_length": 664.9849853515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.224, "grad_norm": 0.00022694087238050997, "learning_rate": 3.46e-06, "loss": 0.0028, "num_tokens": 164326273.0, "reward": 0.8720703125, "reward_std": 0.0972733423113823, "rewards/accuracy_reward": 0.7447916865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3948.0, "completions/mean_length": 690.6068115234375, "completions/mean_terminated_length": 688.3882446289062, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.2256, "grad_norm": 0.00024979733279906213, "learning_rate": 3.44e-06, "loss": 0.0025, "num_tokens": 165645989.0, "reward": 0.833984375, "reward_std": 0.10551907867193222, "rewards/accuracy_reward": 0.669921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 674.322265625, "completions/mean_terminated_length": 658.6572875976562, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2272, "grad_norm": 0.00022735250240657479, "learning_rate": 3.4200000000000007e-06, "loss": 0.0045, "num_tokens": 166926036.0, "reward": 0.8603515625, "reward_std": 0.09379260241985321, "rewards/accuracy_reward": 0.7252604365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3327.0, "completions/mean_length": 707.3971557617188, "completions/mean_terminated_length": 705.1895751953125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.2288, "grad_norm": 0.00022118983906693757, "learning_rate": 3.4000000000000005e-06, "loss": 0.002, "num_tokens": 168279094.0, "reward": 0.8131510615348816, "reward_std": 0.08558207750320435, "rewards/accuracy_reward": 0.6276041865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3626.0, "completions/mean_length": 688.408203125, "completions/mean_terminated_length": 679.5111083984375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2304, "grad_norm": 0.0003067946818191558, "learning_rate": 3.3800000000000007e-06, "loss": 0.004, "num_tokens": 169585161.0, "reward": 0.8590494990348816, "reward_std": 0.11964093148708344, "rewards/accuracy_reward": 0.7213541865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 726.1979370117188, "completions/mean_terminated_length": 692.9652099609375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.232, "grad_norm": 0.00024732324527576566, "learning_rate": 3.3600000000000004e-06, "loss": 0.0054, "num_tokens": 170962297.0, "reward": 0.828125, "reward_std": 0.10565119981765747, "rewards/accuracy_reward": 0.6692708134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 640.162109375, "completions/mean_terminated_length": 635.6564331054688, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.2336, "grad_norm": 0.0002848675358109176, "learning_rate": 3.3400000000000006e-06, "loss": 0.0026, "num_tokens": 172190514.0, "reward": 0.8330078125, "reward_std": 0.10714669525623322, "rewards/accuracy_reward": 0.6686198115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4000.0, "completions/mean_length": 792.001953125, "completions/mean_terminated_length": 789.8494873046875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.2352, "grad_norm": 0.000232724953093566, "learning_rate": 3.3200000000000004e-06, "loss": 0.0024, "num_tokens": 173648949.0, "reward": 0.8505859375, "reward_std": 0.10723203420639038, "rewards/accuracy_reward": 0.7024739384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 722.1569213867188, "completions/mean_terminated_length": 719.9589233398438, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2368, "grad_norm": 0.00026162792346440256, "learning_rate": 3.3000000000000006e-06, "loss": 0.0007, "num_tokens": 175012998.0, "reward": 0.875, "reward_std": 0.12952539324760437, "rewards/accuracy_reward": 0.7506510615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 682.4375, "completions/mean_terminated_length": 680.2136840820312, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2384, "grad_norm": 0.00024456536630168557, "learning_rate": 3.2800000000000004e-06, "loss": 0.0016, "num_tokens": 176310086.0, "reward": 0.8033854365348816, "reward_std": 0.11376194655895233, "rewards/accuracy_reward": 0.6080729365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 746.6771240234375, "completions/mean_terminated_length": 731.3433227539062, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.24, "grad_norm": 0.0001843287463998422, "learning_rate": 3.2600000000000006e-06, "loss": 0.0051, "num_tokens": 177720246.0, "reward": 0.8271484375, "reward_std": 0.09509900212287903, "rewards/accuracy_reward": 0.6627604365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 150 }, { "epoch": 0.24, "eval_completions/clipped_ratio": 0.001953125, "eval_completions/max_length": 2658.75, "eval_completions/max_terminated_length": 2294.25, "eval_completions/mean_length": 692.9423828125, "eval_completions/mean_terminated_length": 686.3740158081055, "eval_completions/min_length": 202.625, "eval_completions/min_terminated_length": 202.625, "eval_loss": 0.0, "eval_num_tokens": 177720246.0, "eval_reward": 0.84228515625, "eval_reward_std": 0.23344828002154827, "eval_rewards/accuracy_reward": 0.6865234375, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.998046875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 174.1005, "eval_samples_per_second": 5.744, "eval_steps_per_second": 0.046, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 678.060546875, "completions/mean_terminated_length": 669.1364135742188, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.2416, "grad_norm": 0.0002267613890580833, "learning_rate": 3.2400000000000003e-06, "loss": 0.0031, "num_tokens": 179009171.0, "reward": 0.8408203125, "reward_std": 0.09594687074422836, "rewards/accuracy_reward": 0.6875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 699.1888427734375, "completions/mean_terminated_length": 699.1888427734375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.2432, "grad_norm": 0.00020184057939331979, "learning_rate": 3.2200000000000005e-06, "loss": 0.0021, "num_tokens": 180334869.0, "reward": 0.8873698115348816, "reward_std": 0.08920140564441681, "rewards/accuracy_reward": 0.7747395634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3976.0, "completions/mean_length": 694.2122802734375, "completions/mean_terminated_length": 689.7770385742188, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2448, "grad_norm": 0.00022934444132260978, "learning_rate": 3.2000000000000003e-06, "loss": 0.0026, "num_tokens": 181649275.0, "reward": 0.8678385615348816, "reward_std": 0.09251650422811508, "rewards/accuracy_reward": 0.7376301884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3360.0, "completions/mean_length": 665.09765625, "completions/mean_terminated_length": 651.6431274414062, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.2464, "grad_norm": 0.00022664369316771626, "learning_rate": 3.1800000000000005e-06, "loss": 0.0043, "num_tokens": 182915281.0, "reward": 0.8639323115348816, "reward_std": 0.09880559891462326, "rewards/accuracy_reward": 0.732421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3948.0, "completions/mean_length": 705.654296875, "completions/mean_terminated_length": 699.0195922851562, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.248, "grad_norm": 0.00023467942082788795, "learning_rate": 3.1600000000000002e-06, "loss": 0.0026, "num_tokens": 184251294.0, "reward": 0.8245443105697632, "reward_std": 0.12011813372373581, "rewards/accuracy_reward": 0.6529948115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 825.3541870117188, "completions/mean_terminated_length": 793.0993041992188, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2496, "grad_norm": 0.00017507070151623338, "learning_rate": 3.1400000000000004e-06, "loss": 0.0078, "num_tokens": 185772542.0, "reward": 0.8011068105697632, "reward_std": 0.10450413823127747, "rewards/accuracy_reward": 0.61328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3187.0, "completions/mean_length": 723.919921875, "completions/mean_terminated_length": 721.72314453125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2512, "grad_norm": 0.00023560119734611362, "learning_rate": 3.12e-06, "loss": 0.001, "num_tokens": 187142339.0, "reward": 0.7955729365348816, "reward_std": 0.11111621558666229, "rewards/accuracy_reward": 0.591796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3989.0, "completions/mean_length": 657.2239990234375, "completions/mean_terminated_length": 652.7405395507812, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.2528, "grad_norm": 0.0003069440135732293, "learning_rate": 3.1000000000000004e-06, "loss": 0.0051, "num_tokens": 188401307.0, "reward": 0.8232421875, "reward_std": 0.118209108710289, "rewards/accuracy_reward": 0.6516926884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3811.0, "completions/mean_length": 741.197265625, "completions/mean_terminated_length": 734.632080078125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.2544, "grad_norm": 0.00019721277931239456, "learning_rate": 3.08e-06, "loss": 0.0008, "num_tokens": 189795722.0, "reward": 0.8668619990348816, "reward_std": 0.0974317193031311, "rewards/accuracy_reward": 0.7369791865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3850.0, "completions/mean_length": 786.9088745117188, "completions/mean_terminated_length": 780.4331665039062, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.256, "grad_norm": 0.00018737994832918048, "learning_rate": 3.0600000000000003e-06, "loss": 0.005, "num_tokens": 191259326.0, "reward": 0.8483073115348816, "reward_std": 0.10372404754161835, "rewards/accuracy_reward": 0.69921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3950.0, "completions/mean_length": 731.23828125, "completions/mean_terminated_length": 726.8513793945312, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.2576, "grad_norm": 0.0002682065241970122, "learning_rate": 3.04e-06, "loss": 0.0031, "num_tokens": 192637836.0, "reward": 0.8483073115348816, "reward_std": 0.10574528574943542, "rewards/accuracy_reward": 0.6998698115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 743.5703125, "completions/mean_terminated_length": 737.0098266601562, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.2592, "grad_norm": 0.00024031188513617963, "learning_rate": 3.0200000000000003e-06, "loss": 0.0047, "num_tokens": 194028376.0, "reward": 0.8688151240348816, "reward_std": 0.11178970336914062, "rewards/accuracy_reward": 0.7395833134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3733.0, "completions/mean_length": 746.8372802734375, "completions/mean_terminated_length": 733.7033081054688, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.2608, "grad_norm": 0.00020617041445802897, "learning_rate": 3e-06, "loss": 0.0047, "num_tokens": 195421598.0, "reward": 0.8460286855697632, "reward_std": 0.10628112405538559, "rewards/accuracy_reward": 0.6966145634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3142.0, "completions/mean_length": 728.02734375, "completions/mean_terminated_length": 714.8196411132812, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.2624, "grad_norm": 0.00024746221606619656, "learning_rate": 2.9800000000000003e-06, "loss": 0.0039, "num_tokens": 196797160.0, "reward": 0.8232421875, "reward_std": 0.13011503219604492, "rewards/accuracy_reward": 0.6510416865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3847.0, "completions/mean_length": 679.6712646484375, "completions/mean_terminated_length": 677.4456176757812, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.264, "grad_norm": 0.00021511182421818376, "learning_rate": 2.96e-06, "loss": 0.0015, "num_tokens": 198107663.0, "reward": 0.8404948115348816, "reward_std": 0.10079500079154968, "rewards/accuracy_reward": 0.6829426884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 165 }, { "epoch": 0.264, "eval_completions/clipped_ratio": 0.0029296875, "eval_completions/max_length": 3122.5, "eval_completions/max_terminated_length": 2324.5, "eval_completions/mean_length": 717.6944122314453, "eval_completions/mean_terminated_length": 707.7649917602539, "eval_completions/min_length": 215.0, "eval_completions/min_terminated_length": 215.0, "eval_loss": 0.0, "eval_num_tokens": 198107663.0, "eval_reward": 0.8427734375, "eval_reward_std": 0.2350256573408842, "eval_rewards/accuracy_reward": 0.689453125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.99609375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 202.7803, "eval_samples_per_second": 4.931, "eval_steps_per_second": 0.039, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4013.0, "completions/mean_length": 681.890625, "completions/mean_terminated_length": 677.4393310546875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.2656, "grad_norm": 0.00023226081975735724, "learning_rate": 2.9400000000000002e-06, "loss": 0.0025, "num_tokens": 199401959.0, "reward": 0.8033854365348816, "reward_std": 0.10436809062957764, "rewards/accuracy_reward": 0.609375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 701.330078125, "completions/mean_terminated_length": 685.7886962890625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.2672, "grad_norm": 0.00022016193543095142, "learning_rate": 2.92e-06, "loss": 0.0072, "num_tokens": 200721474.0, "reward": 0.8766276240348816, "reward_std": 0.10607515275478363, "rewards/accuracy_reward": 0.7610676884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 771.2174682617188, "completions/mean_terminated_length": 740.6346435546875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.2688, "grad_norm": 0.00024248022236861289, "learning_rate": 2.9e-06, "loss": 0.0003, "num_tokens": 202156144.0, "reward": 0.7962239980697632, "reward_std": 0.10773807764053345, "rewards/accuracy_reward": 0.603515625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 763.1491088867188, "completions/mean_terminated_length": 747.8907470703125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2704, "grad_norm": 0.0002491815248504281, "learning_rate": 2.88e-06, "loss": 0.0057, "num_tokens": 203589909.0, "reward": 0.7952474355697632, "reward_std": 0.13419264554977417, "rewards/accuracy_reward": 0.59765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 792.873046875, "completions/mean_terminated_length": 784.2487182617188, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.272, "grad_norm": 0.0002829180157277733, "learning_rate": 2.86e-06, "loss": 0.0073, "num_tokens": 205065522.0, "reward": 0.8427734375, "reward_std": 0.1227998360991478, "rewards/accuracy_reward": 0.6901041865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 698.5247802734375, "completions/mean_terminated_length": 689.654052734375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.2736, "grad_norm": 0.0002195298729930073, "learning_rate": 2.84e-06, "loss": 0.0052, "num_tokens": 206378680.0, "reward": 0.875, "reward_std": 0.11634199321269989, "rewards/accuracy_reward": 0.7526041865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 704.4544677734375, "completions/mean_terminated_length": 697.8173828125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.2752, "grad_norm": 0.00024867590400390327, "learning_rate": 2.82e-06, "loss": 0.001, "num_tokens": 207714610.0, "reward": 0.8232421875, "reward_std": 0.12597505748271942, "rewards/accuracy_reward": 0.6484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3451.0, "completions/mean_length": 749.7526245117188, "completions/mean_terminated_length": 741.0156860351562, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2768, "grad_norm": 0.00021386242588050663, "learning_rate": 2.8000000000000003e-06, "loss": 0.0005, "num_tokens": 209126454.0, "reward": 0.83203125, "reward_std": 0.10877606272697449, "rewards/accuracy_reward": 0.6673176884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3706.0, "completions/mean_length": 654.51171875, "completions/mean_terminated_length": 641.0156860351562, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.2784, "grad_norm": 0.00024265458341687918, "learning_rate": 2.7800000000000005e-06, "loss": 0.005, "num_tokens": 210380072.0, "reward": 0.8961588740348816, "reward_std": 0.11534937471151352, "rewards/accuracy_reward": 0.7962239384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3614.0, "completions/mean_length": 719.1009521484375, "completions/mean_terminated_length": 712.4924926757812, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.28, "grad_norm": 0.00021166351507417858, "learning_rate": 2.7600000000000003e-06, "loss": 0.003, "num_tokens": 211733155.0, "reward": 0.857421875, "reward_std": 0.10154535621404648, "rewards/accuracy_reward": 0.7174479365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 697.1744995117188, "completions/mean_terminated_length": 688.30029296875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2816, "grad_norm": 0.00024020136334002018, "learning_rate": 2.7400000000000004e-06, "loss": 0.0027, "num_tokens": 213057807.0, "reward": 0.8349609375, "reward_std": 0.10506369918584824, "rewards/accuracy_reward": 0.673828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3795.0, "completions/mean_length": 766.68359375, "completions/mean_terminated_length": 764.5146484375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2832, "grad_norm": 0.0001792066905181855, "learning_rate": 2.7200000000000002e-06, "loss": 0.0031, "num_tokens": 214488905.0, "reward": 0.8199869990348816, "reward_std": 0.09937269985675812, "rewards/accuracy_reward": 0.6412760615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 708.5299682617188, "completions/mean_terminated_length": 706.3231201171875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2848, "grad_norm": 0.00024319469230249524, "learning_rate": 2.7000000000000004e-06, "loss": 0.0035, "num_tokens": 215829111.0, "reward": 0.8414713740348816, "reward_std": 0.11838024109601974, "rewards/accuracy_reward": 0.68359375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2732.0, "completions/max_terminated_length": 2732.0, "completions/mean_length": 666.25390625, "completions/mean_terminated_length": 666.25390625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.2864, "grad_norm": 0.00021719449432566762, "learning_rate": 2.68e-06, "loss": 0.0001, "num_tokens": 217094973.0, "reward": 0.8151041865348816, "reward_std": 0.09207914769649506, "rewards/accuracy_reward": 0.6302083134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3695.0, "completions/mean_length": 802.951171875, "completions/mean_terminated_length": 785.7100830078125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.288, "grad_norm": 0.0001964816910913214, "learning_rate": 2.6600000000000004e-06, "loss": 0.0044, "num_tokens": 218581778.0, "reward": 0.8551432490348816, "reward_std": 0.11192312836647034, "rewards/accuracy_reward": 0.7161458134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 180 }, { "epoch": 0.288, "eval_completions/clipped_ratio": 0.0048828125, "eval_completions/max_length": 3209.25, "eval_completions/max_terminated_length": 2207.75, "eval_completions/mean_length": 741.767204284668, "eval_completions/mean_terminated_length": 725.3175811767578, "eval_completions/min_length": 215.5, "eval_completions/min_terminated_length": 215.5, "eval_loss": 0.0, "eval_num_tokens": 218581778.0, "eval_reward": 0.84228515625, "eval_reward_std": 0.23687533661723137, "eval_rewards/accuracy_reward": 0.689453125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9951171875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 208.8265, "eval_samples_per_second": 4.789, "eval_steps_per_second": 0.038, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 738.3932495117188, "completions/mean_terminated_length": 725.2261352539062, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.2896, "grad_norm": 0.00022324849851429462, "learning_rate": 2.64e-06, "loss": 0.0032, "num_tokens": 219968782.0, "reward": 0.8512369990348816, "reward_std": 0.11554962396621704, "rewards/accuracy_reward": 0.7063801884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 714.5989990234375, "completions/mean_terminated_length": 694.6693115234375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2912, "grad_norm": 0.00022569728025700897, "learning_rate": 2.6200000000000003e-06, "loss": 0.0058, "num_tokens": 221325702.0, "reward": 0.8284505605697632, "reward_std": 0.11835271120071411, "rewards/accuracy_reward": 0.6627604365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 647.41796875, "completions/mean_terminated_length": 647.41796875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.2928, "grad_norm": 0.0002429847518214956, "learning_rate": 2.6e-06, "loss": 0.0018, "num_tokens": 222582888.0, "reward": 0.8623046875, "reward_std": 0.1010977178812027, "rewards/accuracy_reward": 0.724609375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 696.9290771484375, "completions/mean_terminated_length": 692.4973754882812, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2944, "grad_norm": 0.00019927756511606276, "learning_rate": 2.5800000000000003e-06, "loss": 0.0021, "num_tokens": 223909307.0, "reward": 0.8343099355697632, "reward_std": 0.1007145494222641, "rewards/accuracy_reward": 0.669921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 686.349609375, "completions/mean_terminated_length": 684.1283569335938, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.296, "grad_norm": 0.00022834978881292045, "learning_rate": 2.56e-06, "loss": 0.0018, "num_tokens": 225221076.0, "reward": 0.87109375, "reward_std": 0.09891021251678467, "rewards/accuracy_reward": 0.7428385615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 767.5482177734375, "completions/mean_terminated_length": 761.0346069335938, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.2976, "grad_norm": 0.00028291091439314187, "learning_rate": 2.5400000000000002e-06, "loss": 0.0024, "num_tokens": 226654686.0, "reward": 0.8206380605697632, "reward_std": 0.1270560473203659, "rewards/accuracy_reward": 0.6432291865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 754.7702026367188, "completions/mean_terminated_length": 735.0772705078125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.2992, "grad_norm": 0.00022468093084171414, "learning_rate": 2.52e-06, "loss": 0.0059, "num_tokens": 228074205.0, "reward": 0.8369140625, "reward_std": 0.11414369195699692, "rewards/accuracy_reward": 0.6822916865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 800.7728271484375, "completions/mean_terminated_length": 777.00390625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.3008, "grad_norm": 0.00021207747340667993, "learning_rate": 2.5e-06, "loss": 0.0037, "num_tokens": 229547616.0, "reward": 0.8258463740348816, "reward_std": 0.1256496012210846, "rewards/accuracy_reward": 0.66015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 710.4173583984375, "completions/mean_terminated_length": 692.6917724609375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.3024, "grad_norm": 0.00023993889044504613, "learning_rate": 2.4800000000000004e-06, "loss": 0.0028, "num_tokens": 230892225.0, "reward": 0.8626302480697632, "reward_std": 0.13042403757572174, "rewards/accuracy_reward": 0.73046875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3646.0, "completions/mean_length": 784.970703125, "completions/mean_terminated_length": 776.3257446289062, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.304, "grad_norm": 0.00032675074180588126, "learning_rate": 2.46e-06, "loss": 0.0057, "num_tokens": 232350836.0, "reward": 0.8235677480697632, "reward_std": 0.1223805695772171, "rewards/accuracy_reward": 0.650390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 806.1849365234375, "completions/mean_terminated_length": 788.9607543945312, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.3056, "grad_norm": 0.00022698614338878542, "learning_rate": 2.4400000000000004e-06, "loss": 0.0033, "num_tokens": 233858320.0, "reward": 0.82421875, "reward_std": 0.10769402235746384, "rewards/accuracy_reward": 0.6536458134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 777.1204833984375, "completions/mean_terminated_length": 774.9583129882812, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.3072, "grad_norm": 0.00020551096531562507, "learning_rate": 2.42e-06, "loss": -0.0001, "num_tokens": 235312969.0, "reward": 0.810546875, "reward_std": 0.09532786905765533, "rewards/accuracy_reward": 0.6217448115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 670.65234375, "completions/mean_terminated_length": 670.65234375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.3088, "grad_norm": 0.00022991349396761507, "learning_rate": 2.4000000000000003e-06, "loss": 0.0027, "num_tokens": 236601555.0, "reward": 0.8870443105697632, "reward_std": 0.08219411224126816, "rewards/accuracy_reward": 0.7740885615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 695.2780151367188, "completions/mean_terminated_length": 693.0625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.3104, "grad_norm": 0.00025679238024167717, "learning_rate": 2.38e-06, "loss": 0.0003, "num_tokens": 237922526.0, "reward": 0.8538411855697632, "reward_std": 0.10394446551799774, "rewards/accuracy_reward": 0.7083333134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2897.0, "completions/mean_length": 701.9459838867188, "completions/mean_terminated_length": 697.5208740234375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.312, "grad_norm": 0.0002799215435516089, "learning_rate": 2.3600000000000003e-06, "loss": 0.0023, "num_tokens": 239252427.0, "reward": 0.8362630605697632, "reward_std": 0.11378495395183563, "rewards/accuracy_reward": 0.673828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 195 }, { "epoch": 0.312, "eval_completions/clipped_ratio": 0.004131610576923073, "eval_completions/max_length": 3031.875, "eval_completions/max_terminated_length": 1979.5, "eval_completions/mean_length": 720.7300186157227, "eval_completions/mean_terminated_length": 706.7345962524414, "eval_completions/min_length": 208.625, "eval_completions/min_terminated_length": 208.625, "eval_loss": 0.0, "eval_num_tokens": 239252427.0, "eval_reward": 0.837890625, "eval_reward_std": 0.2377728447318077, "eval_rewards/accuracy_reward": 0.6806640625, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9951171875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 198.0807, "eval_samples_per_second": 5.048, "eval_steps_per_second": 0.04, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 755.951171875, "completions/mean_terminated_length": 751.596435546875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3136, "grad_norm": 0.00020399436471052468, "learning_rate": 2.3400000000000005e-06, "loss": 0.0022, "num_tokens": 240674528.0, "reward": 0.8395182490348816, "reward_std": 0.09909713268280029, "rewards/accuracy_reward": 0.6803385615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3750.0, "completions/mean_length": 632.9739990234375, "completions/mean_terminated_length": 628.4589233398438, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.3152, "grad_norm": 0.0002569823991507292, "learning_rate": 2.3200000000000002e-06, "loss": 0.0013, "num_tokens": 241897048.0, "reward": 0.8251953125, "reward_std": 0.11262068152427673, "rewards/accuracy_reward": 0.6516926884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3682.0, "completions/max_terminated_length": 3682.0, "completions/mean_length": 723.0182495117188, "completions/mean_terminated_length": 723.0182495117188, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3168, "grad_norm": 0.00025039640604518354, "learning_rate": 2.3000000000000004e-06, "loss": 0.0028, "num_tokens": 243249972.0, "reward": 0.845703125, "reward_std": 0.12428542226552963, "rewards/accuracy_reward": 0.6920573115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3403.0, "completions/mean_length": 767.6022338867188, "completions/mean_terminated_length": 763.2626953125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3184, "grad_norm": 0.00019613925542216748, "learning_rate": 2.28e-06, "loss": 0.0033, "num_tokens": 244687249.0, "reward": 0.8229166865348816, "reward_std": 0.1075209528207779, "rewards/accuracy_reward": 0.6471354365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 655.0631713867188, "completions/mean_terminated_length": 652.8214721679688, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.32, "grad_norm": 0.0002827704302035272, "learning_rate": 2.2600000000000004e-06, "loss": 0.0014, "num_tokens": 245948018.0, "reward": 0.8678385615348816, "reward_std": 0.10062810033559799, "rewards/accuracy_reward": 0.736328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3958.0, "completions/mean_length": 719.197265625, "completions/mean_terminated_length": 703.7377319335938, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.3216, "grad_norm": 0.00022865460778120905, "learning_rate": 2.24e-06, "loss": 0.005, "num_tokens": 247314817.0, "reward": 0.8541666865348816, "reward_std": 0.1042466014623642, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 654.103515625, "completions/mean_terminated_length": 647.367919921875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3232, "grad_norm": 0.0002972015645354986, "learning_rate": 2.2200000000000003e-06, "loss": 0.0043, "num_tokens": 248564896.0, "reward": 0.8072916865348816, "reward_std": 0.11283939331769943, "rewards/accuracy_reward": 0.6178385615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 669.0794677734375, "completions/mean_terminated_length": 662.3731689453125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3248, "grad_norm": 0.0003115416329819709, "learning_rate": 2.2e-06, "loss": 0.0031, "num_tokens": 249841370.0, "reward": 0.8330078125, "reward_std": 0.10496409982442856, "rewards/accuracy_reward": 0.66796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2023.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 656.328125, "completions/mean_terminated_length": 656.328125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.3264, "grad_norm": 0.00026321772020310163, "learning_rate": 2.1800000000000003e-06, "loss": 0.0001, "num_tokens": 251091026.0, "reward": 0.8844401240348816, "reward_std": 0.1178918182849884, "rewards/accuracy_reward": 0.7688801884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 667.822265625, "completions/mean_terminated_length": 665.5889282226562, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.328, "grad_norm": 0.00023584705195389688, "learning_rate": 2.16e-06, "loss": 0.001, "num_tokens": 252368609.0, "reward": 0.8753255605697632, "reward_std": 0.07695117592811584, "rewards/accuracy_reward": 0.7513020634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3653.0, "completions/mean_length": 705.68359375, "completions/mean_terminated_length": 699.0489501953125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3296, "grad_norm": 0.00022140606597531587, "learning_rate": 2.1400000000000003e-06, "loss": 0.0022, "num_tokens": 253714811.0, "reward": 0.8414713740348816, "reward_std": 0.10839053243398666, "rewards/accuracy_reward": 0.685546875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3847.0, "completions/mean_length": 667.0807495117188, "completions/mean_terminated_length": 660.3705444335938, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.3312, "grad_norm": 0.00024301458324771374, "learning_rate": 2.12e-06, "loss": 0.0008, "num_tokens": 255003575.0, "reward": 0.8203125, "reward_std": 0.10410245507955551, "rewards/accuracy_reward": 0.6438801884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 635.0306396484375, "completions/mean_terminated_length": 619.1857299804688, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3328, "grad_norm": 0.00020010562730021775, "learning_rate": 2.1000000000000002e-06, "loss": 0.0038, "num_tokens": 256227398.0, "reward": 0.83984375, "reward_std": 0.09344401955604553, "rewards/accuracy_reward": 0.6842448115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3611.0, "completions/mean_length": 726.158203125, "completions/mean_terminated_length": 717.3596801757812, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3344, "grad_norm": 0.00023134164803195745, "learning_rate": 2.08e-06, "loss": 0.0026, "num_tokens": 257593529.0, "reward": 0.8046875, "reward_std": 0.11423548310995102, "rewards/accuracy_reward": 0.6126301884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 715.986328125, "completions/mean_terminated_length": 713.7843627929688, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.336, "grad_norm": 0.0002050853072432801, "learning_rate": 2.06e-06, "loss": 0.001, "num_tokens": 258952772.0, "reward": 0.82421875, "reward_std": 0.08276193588972092, "rewards/accuracy_reward": 0.6490885615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 210 }, { "epoch": 0.336, "eval_completions/clipped_ratio": 0.0009765625, "eval_completions/max_length": 2296.125, "eval_completions/max_terminated_length": 2104.875, "eval_completions/mean_length": 691.8269271850586, "eval_completions/mean_terminated_length": 688.5187606811523, "eval_completions/min_length": 214.75, "eval_completions/min_terminated_length": 214.75, "eval_loss": 0.0, "eval_num_tokens": 258952772.0, "eval_reward": 0.845703125, "eval_reward_std": 0.23166663758456707, "eval_rewards/accuracy_reward": 0.6923828125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9990234375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 153.7523, "eval_samples_per_second": 6.504, "eval_steps_per_second": 0.052, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 694.30859375, "completions/mean_terminated_length": 692.0924682617188, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3376, "grad_norm": 0.00019909637921955436, "learning_rate": 2.04e-06, "loss": 0.0036, "num_tokens": 260260958.0, "reward": 0.8688151240348816, "reward_std": 0.09698426723480225, "rewards/accuracy_reward": 0.73828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 659.4798583984375, "completions/mean_terminated_length": 650.5072021484375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.3392, "grad_norm": 0.0002826956333592534, "learning_rate": 2.02e-06, "loss": 0.0047, "num_tokens": 261516895.0, "reward": 0.8955078125, "reward_std": 0.10796459764242172, "rewards/accuracy_reward": 0.7936198115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 669.6178588867188, "completions/mean_terminated_length": 660.6716918945312, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3408, "grad_norm": 0.0002060349506791681, "learning_rate": 2.0000000000000003e-06, "loss": 0.0007, "num_tokens": 262799796.0, "reward": 0.8401693105697632, "reward_std": 0.0928923711180687, "rewards/accuracy_reward": 0.6829426884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 584.1947021484375, "completions/mean_terminated_length": 579.6160278320312, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3424, "grad_norm": 0.00021981749159749597, "learning_rate": 1.98e-06, "loss": 0.0025, "num_tokens": 263936127.0, "reward": 0.8811849355697632, "reward_std": 0.07451210916042328, "rewards/accuracy_reward": 0.763671875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 706.6868896484375, "completions/mean_terminated_length": 700.0541381835938, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.344, "grad_norm": 0.0002008735027629882, "learning_rate": 1.9600000000000003e-06, "loss": 0.0041, "num_tokens": 265277182.0, "reward": 0.865234375, "reward_std": 0.09812372922897339, "rewards/accuracy_reward": 0.732421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3352.0, "completions/mean_length": 739.2025146484375, "completions/mean_terminated_length": 734.825927734375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3456, "grad_norm": 0.00019547375268302858, "learning_rate": 1.94e-06, "loss": 0.0018, "num_tokens": 266667605.0, "reward": 0.8697916865348816, "reward_std": 0.0944841131567955, "rewards/accuracy_reward": 0.7408854365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3203.0, "completions/mean_length": 689.9674682617188, "completions/mean_terminated_length": 681.0744018554688, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.3472, "grad_norm": 0.00019665432046167552, "learning_rate": 1.9200000000000003e-06, "loss": 0.0026, "num_tokens": 267975715.0, "reward": 0.8356119990348816, "reward_std": 0.09117422997951508, "rewards/accuracy_reward": 0.673828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3805.0, "completions/mean_length": 706.6771240234375, "completions/mean_terminated_length": 702.2581176757812, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.3488, "grad_norm": 0.00021449057385325432, "learning_rate": 1.9000000000000002e-06, "loss": 0.0027, "num_tokens": 269326483.0, "reward": 0.8043619990348816, "reward_std": 0.1009722501039505, "rewards/accuracy_reward": 0.6106770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3771.0, "completions/mean_length": 769.849609375, "completions/mean_terminated_length": 761.1651611328125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.3504, "grad_norm": 0.0001826629159040749, "learning_rate": 1.8800000000000002e-06, "loss": 0.0049, "num_tokens": 270758476.0, "reward": 0.8310546875, "reward_std": 0.09529079496860504, "rewards/accuracy_reward": 0.666015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 723.4095458984375, "completions/mean_terminated_length": 712.3951416015625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.352, "grad_norm": 0.0001924209063872695, "learning_rate": 1.8600000000000002e-06, "loss": 0.0041, "num_tokens": 272124289.0, "reward": 0.8701171875, "reward_std": 0.07167939841747284, "rewards/accuracy_reward": 0.7434895634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3981.0, "completions/mean_length": 717.2936401367188, "completions/mean_terminated_length": 699.6040649414062, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.3536, "grad_norm": 0.0002461047552060336, "learning_rate": 1.8400000000000002e-06, "loss": 0.0055, "num_tokens": 273476228.0, "reward": 0.8616536855697632, "reward_std": 0.12376578152179718, "rewards/accuracy_reward": 0.7330729365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4004.0, "completions/mean_length": 815.455078125, "completions/mean_terminated_length": 804.7413330078125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3552, "grad_norm": 0.0002113583032041788, "learning_rate": 1.8200000000000002e-06, "loss": 0.003, "num_tokens": 274983423.0, "reward": 0.8352864980697632, "reward_std": 0.1211782917380333, "rewards/accuracy_reward": 0.6744791865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3822.0, "completions/mean_length": 803.5377807617188, "completions/mean_terminated_length": 801.392822265625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.3568, "grad_norm": 0.00020958564709872007, "learning_rate": 1.8000000000000001e-06, "loss": 0.0023, "num_tokens": 276470265.0, "reward": 0.8561198115348816, "reward_std": 0.12089913338422775, "rewards/accuracy_reward": 0.7141926884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2388.0, "completions/mean_length": 795.5697021484375, "completions/mean_terminated_length": 791.2666015625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3584, "grad_norm": 0.0002054341894108802, "learning_rate": 1.7800000000000001e-06, "loss": 0.0019, "num_tokens": 277954180.0, "reward": 0.8570963740348816, "reward_std": 0.09880204498767853, "rewards/accuracy_reward": 0.7154948115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3779.0, "completions/mean_length": 729.8626708984375, "completions/mean_terminated_length": 714.451904296875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.36, "grad_norm": 0.0002290692355018109, "learning_rate": 1.76e-06, "loss": 0.0053, "num_tokens": 279328209.0, "reward": 0.8225911855697632, "reward_std": 0.12789355218410492, "rewards/accuracy_reward": 0.650390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 225 }, { "epoch": 0.36, "eval_completions/clipped_ratio": 0.001953125, "eval_completions/max_length": 2818.75, "eval_completions/max_terminated_length": 2726.25, "eval_completions/mean_length": 729.7336273193359, "eval_completions/mean_terminated_length": 723.1668701171875, "eval_completions/min_length": 197.75, "eval_completions/min_terminated_length": 197.75, "eval_loss": 0.0, "eval_num_tokens": 279328209.0, "eval_reward": 0.84619140625, "eval_reward_std": 0.23193285055458546, "eval_rewards/accuracy_reward": 0.6962890625, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.99609375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 185.3096, "eval_samples_per_second": 5.396, "eval_steps_per_second": 0.043, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3765.0, "completions/mean_length": 693.123046875, "completions/mean_terminated_length": 684.23828125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.3616, "grad_norm": 0.00026315837749280035, "learning_rate": 1.74e-06, "loss": 0.0035, "num_tokens": 280647182.0, "reward": 0.8902994990348816, "reward_std": 0.09217560291290283, "rewards/accuracy_reward": 0.7838541865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2168.0, "completions/mean_length": 674.552734375, "completions/mean_terminated_length": 670.0919189453125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3632, "grad_norm": 0.00023676631099078804, "learning_rate": 1.72e-06, "loss": 0.0024, "num_tokens": 281929119.0, "reward": 0.8655599355697632, "reward_std": 0.09075771272182465, "rewards/accuracy_reward": 0.732421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 648.2806396484375, "completions/mean_terminated_length": 648.2806396484375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.3648, "grad_norm": 0.0002063358697341755, "learning_rate": 1.7000000000000002e-06, "loss": 0.0014, "num_tokens": 283183438.0, "reward": 0.8896484375, "reward_std": 0.07809191942214966, "rewards/accuracy_reward": 0.779296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 748.0736083984375, "completions/mean_terminated_length": 745.8925170898438, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.3664, "grad_norm": 0.00020575226517394185, "learning_rate": 1.6800000000000002e-06, "loss": -0.0012, "num_tokens": 284590591.0, "reward": 0.853515625, "reward_std": 0.09160354733467102, "rewards/accuracy_reward": 0.7076823115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3439.0, "completions/mean_length": 693.9928588867188, "completions/mean_terminated_length": 687.3353271484375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.368, "grad_norm": 0.0002574085956439376, "learning_rate": 1.6600000000000002e-06, "loss": 0.0032, "num_tokens": 285908244.0, "reward": 0.8743489980697632, "reward_std": 0.11725397408008575, "rewards/accuracy_reward": 0.7506510615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 743.94140625, "completions/mean_terminated_length": 737.381591796875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3696, "grad_norm": 0.00026451697340235114, "learning_rate": 1.6400000000000002e-06, "loss": 0.0045, "num_tokens": 287312058.0, "reward": 0.85546875, "reward_std": 0.11085309088230133, "rewards/accuracy_reward": 0.7135416865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3496.0, "completions/max_terminated_length": 3496.0, "completions/mean_length": 707.3294677734375, "completions/mean_terminated_length": 707.3294677734375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3712, "grad_norm": 0.0002390411973465234, "learning_rate": 1.6200000000000002e-06, "loss": 0.0023, "num_tokens": 288651124.0, "reward": 0.8792318105697632, "reward_std": 0.11159934848546982, "rewards/accuracy_reward": 0.7584635615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3436.0, "completions/mean_length": 686.7135620117188, "completions/mean_terminated_length": 684.4924926757812, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3728, "grad_norm": 0.0002625233319122344, "learning_rate": 1.6000000000000001e-06, "loss": 0.0023, "num_tokens": 289959964.0, "reward": 0.845703125, "reward_std": 0.10595854371786118, "rewards/accuracy_reward": 0.6927083134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 707.7077026367188, "completions/mean_terminated_length": 705.5003051757812, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3744, "grad_norm": 0.0002784116950351745, "learning_rate": 1.5800000000000001e-06, "loss": 0.0035, "num_tokens": 291290779.0, "reward": 0.853515625, "reward_std": 0.08774536103010178, "rewards/accuracy_reward": 0.7076823115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3617.0, "completions/max_terminated_length": 3617.0, "completions/mean_length": 705.5032958984375, "completions/mean_terminated_length": 705.5032958984375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.376, "grad_norm": 0.00022184595582075417, "learning_rate": 1.56e-06, "loss": 0.0022, "num_tokens": 292633952.0, "reward": 0.8268229365348816, "reward_std": 0.09138500690460205, "rewards/accuracy_reward": 0.654296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3605.0, "completions/mean_length": 739.71484375, "completions/mean_terminated_length": 735.3389892578125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3776, "grad_norm": 0.00020852226589340717, "learning_rate": 1.54e-06, "loss": 0.0005, "num_tokens": 294015402.0, "reward": 0.8531901240348816, "reward_std": 0.09528094530105591, "rewards/accuracy_reward": 0.7083333134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 686.0026245117188, "completions/mean_terminated_length": 681.5567016601562, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.3792, "grad_norm": 0.00022435607388615608, "learning_rate": 1.52e-06, "loss": 0.0006, "num_tokens": 295321902.0, "reward": 0.8362630605697632, "reward_std": 0.09408549964427948, "rewards/accuracy_reward": 0.673828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 704.5697021484375, "completions/mean_terminated_length": 697.9328002929688, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.3808, "grad_norm": 0.00026226439513266087, "learning_rate": 1.5e-06, "loss": -0.0004, "num_tokens": 296657241.0, "reward": 0.8095703125, "reward_std": 0.0988236516714096, "rewards/accuracy_reward": 0.62109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 636.7018432617188, "completions/mean_terminated_length": 632.191650390625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.3824, "grad_norm": 0.0001824765931814909, "learning_rate": 1.48e-06, "loss": 0.0037, "num_tokens": 297872367.0, "reward": 0.8688151240348816, "reward_std": 0.0741402804851532, "rewards/accuracy_reward": 0.7389323115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3759.0, "completions/mean_length": 819.509765625, "completions/mean_terminated_length": 813.0978393554688, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.384, "grad_norm": 0.00023800460621714592, "learning_rate": 1.46e-06, "loss": 0.0018, "num_tokens": 299402078.0, "reward": 0.8274739980697632, "reward_std": 0.12115192413330078, "rewards/accuracy_reward": 0.658203125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 240 }, { "epoch": 0.384, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2215.25, "eval_completions/max_terminated_length": 2215.25, "eval_completions/mean_length": 721.7137222290039, "eval_completions/mean_terminated_length": 721.7137222290039, "eval_completions/min_length": 223.625, "eval_completions/min_terminated_length": 223.625, "eval_loss": 0.0, "eval_num_tokens": 299402078.0, "eval_reward": 0.849609375, "eval_reward_std": 0.23077328875660896, "eval_rewards/accuracy_reward": 0.7001953125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9990234375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 147.4213, "eval_samples_per_second": 6.783, "eval_steps_per_second": 0.054, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 865.8678588867188, "completions/mean_terminated_length": 838.296142578125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.3856, "grad_norm": 0.0001780574966687709, "learning_rate": 1.44e-06, "loss": 0.0044, "num_tokens": 300996627.0, "reward": 0.7845052480697632, "reward_std": 0.08591097593307495, "rewards/accuracy_reward": 0.5774739384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 667.2064208984375, "completions/mean_terminated_length": 667.2064208984375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3872, "grad_norm": 0.00025171952438540757, "learning_rate": 1.42e-06, "loss": 0.0026, "num_tokens": 302285872.0, "reward": 0.8430989980697632, "reward_std": 0.10217689722776413, "rewards/accuracy_reward": 0.6861979365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3715.0, "completions/mean_length": 755.7611083984375, "completions/mean_terminated_length": 749.2244262695312, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3888, "grad_norm": 0.00021929637296125293, "learning_rate": 1.4000000000000001e-06, "loss": 0.0031, "num_tokens": 303703009.0, "reward": 0.83984375, "reward_std": 0.089909128844738, "rewards/accuracy_reward": 0.6822916865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 781.4375, "completions/mean_terminated_length": 777.1160278320312, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3904, "grad_norm": 0.0002218369481852278, "learning_rate": 1.3800000000000001e-06, "loss": 0.0036, "num_tokens": 305168097.0, "reward": 0.8352864980697632, "reward_std": 0.09312301874160767, "rewards/accuracy_reward": 0.6725260615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4078.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 746.35546875, "completions/mean_terminated_length": 746.35546875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.392, "grad_norm": 0.00021017546532675624, "learning_rate": 1.3600000000000001e-06, "loss": 0.002, "num_tokens": 306564419.0, "reward": 0.8414713740348816, "reward_std": 0.09496369957923889, "rewards/accuracy_reward": 0.68359375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4025.0, "completions/max_terminated_length": 4025.0, "completions/mean_length": 670.5358276367188, "completions/mean_terminated_length": 670.5358276367188, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3936, "grad_norm": 0.00022105369134806097, "learning_rate": 1.34e-06, "loss": 0.0016, "num_tokens": 307858426.0, "reward": 0.8583984375, "reward_std": 0.08981170505285263, "rewards/accuracy_reward": 0.7174479365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3786.0, "completions/mean_length": 761.5579833984375, "completions/mean_terminated_length": 759.3856811523438, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.3952, "grad_norm": 0.00023969220637809485, "learning_rate": 1.32e-06, "loss": 0.0025, "num_tokens": 309279859.0, "reward": 0.8590494990348816, "reward_std": 0.11394667625427246, "rewards/accuracy_reward": 0.7200520634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 820.6322021484375, "completions/mean_terminated_length": 820.6322021484375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3968, "grad_norm": 0.00021602684864774346, "learning_rate": 1.3e-06, "loss": 0.0021, "num_tokens": 310791870.0, "reward": 0.8056640625, "reward_std": 0.11231070756912231, "rewards/accuracy_reward": 0.611328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4013.0, "completions/mean_length": 783.142578125, "completions/mean_terminated_length": 765.7977905273438, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.3984, "grad_norm": 0.00020689787925221026, "learning_rate": 1.28e-06, "loss": 0.0056, "num_tokens": 312234201.0, "reward": 0.8720703125, "reward_std": 0.1073552817106247, "rewards/accuracy_reward": 0.7506510615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 723.06640625, "completions/mean_terminated_length": 716.4657592773438, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.4, "grad_norm": 0.00025999598437920213, "learning_rate": 1.26e-06, "loss": 0.0012, "num_tokens": 313602399.0, "reward": 0.8248698115348816, "reward_std": 0.0987776443362236, "rewards/accuracy_reward": 0.6536458134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3507.0, "completions/mean_length": 655.0130615234375, "completions/mean_terminated_length": 652.7713012695312, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.4016, "grad_norm": 0.00023388808767776936, "learning_rate": 1.2400000000000002e-06, "loss": 0.0027, "num_tokens": 314863123.0, "reward": 0.8675130605697632, "reward_std": 0.08050936460494995, "rewards/accuracy_reward": 0.736328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 679.814453125, "completions/mean_terminated_length": 675.3604736328125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.4032, "grad_norm": 0.0002586299669928849, "learning_rate": 1.2200000000000002e-06, "loss": 0.0028, "num_tokens": 316164726.0, "reward": 0.8567708730697632, "reward_std": 0.10077904164791107, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3111.0, "completions/mean_length": 714.3327026367188, "completions/mean_terminated_length": 712.129638671875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.4048, "grad_norm": 0.0002254310093121603, "learning_rate": 1.2000000000000002e-06, "loss": 0.0008, "num_tokens": 317506773.0, "reward": 0.9176432490348816, "reward_std": 0.1048509031534195, "rewards/accuracy_reward": 0.8359375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3337.0, "completions/mean_length": 714.310546875, "completions/mean_terminated_length": 705.4810791015625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.4064, "grad_norm": 0.00022292938956525177, "learning_rate": 1.1800000000000001e-06, "loss": 0.0021, "num_tokens": 318861010.0, "reward": 0.8684896230697632, "reward_std": 0.10260581970214844, "rewards/accuracy_reward": 0.7395833134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3206.0, "completions/mean_length": 709.8014526367188, "completions/mean_terminated_length": 705.3865356445312, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.408, "grad_norm": 0.00022816927230451256, "learning_rate": 1.1600000000000001e-06, "loss": 0.003, "num_tokens": 320205185.0, "reward": 0.8180338740348816, "reward_std": 0.09039068222045898, "rewards/accuracy_reward": 0.6380208134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 255 }, { "epoch": 0.408, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2377.875, "eval_completions/max_terminated_length": 2377.875, "eval_completions/mean_length": 725.1042709350586, "eval_completions/mean_terminated_length": 725.1042709350586, "eval_completions/min_length": 196.375, "eval_completions/min_terminated_length": 196.375, "eval_loss": 0.0, "eval_num_tokens": 320205185.0, "eval_reward": 0.84716796875, "eval_reward_std": 0.23151800595223904, "eval_rewards/accuracy_reward": 0.6953125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9990234375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 156.6812, "eval_samples_per_second": 6.382, "eval_steps_per_second": 0.051, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 848.0579833984375, "completions/mean_terminated_length": 835.3209228515625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.4096, "grad_norm": 0.0002211683604400605, "learning_rate": 1.14e-06, "loss": 0.0053, "num_tokens": 321756282.0, "reward": 0.84375, "reward_std": 0.10334877669811249, "rewards/accuracy_reward": 0.693359375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3850.0, "completions/mean_length": 772.1276245117188, "completions/mean_terminated_length": 765.6229858398438, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.4112, "grad_norm": 0.0002086065651383251, "learning_rate": 1.12e-06, "loss": 0.0031, "num_tokens": 323206590.0, "reward": 0.8561198115348816, "reward_std": 0.09400318562984467, "rewards/accuracy_reward": 0.7141926884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 745.0885620117188, "completions/mean_terminated_length": 742.905517578125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4128, "grad_norm": 0.00022387625358533114, "learning_rate": 1.1e-06, "loss": 0.0016, "num_tokens": 324614118.0, "reward": 0.8678385615348816, "reward_std": 0.0916781947016716, "rewards/accuracy_reward": 0.7369791865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 764.2044677734375, "completions/mean_terminated_length": 762.0338745117188, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.4144, "grad_norm": 0.0002262386551592499, "learning_rate": 1.08e-06, "loss": 0.0028, "num_tokens": 326038208.0, "reward": 0.8736979365348816, "reward_std": 0.10700567811727524, "rewards/accuracy_reward": 0.748046875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 709.8197021484375, "completions/mean_terminated_length": 703.193115234375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.416, "grad_norm": 0.00023218023125082254, "learning_rate": 1.06e-06, "loss": 0.0055, "num_tokens": 327381771.0, "reward": 0.8756510615348816, "reward_std": 0.08443930745124817, "rewards/accuracy_reward": 0.7532551884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 777.40234375, "completions/mean_terminated_length": 770.9080200195312, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4176, "grad_norm": 0.0002234979037893936, "learning_rate": 1.04e-06, "loss": 0.002, "num_tokens": 328827221.0, "reward": 0.8551432490348816, "reward_std": 0.11680393666028976, "rewards/accuracy_reward": 0.712890625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 752.5390625, "completions/mean_terminated_length": 752.5390625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4192, "grad_norm": 0.0002201717288699001, "learning_rate": 1.02e-06, "loss": 0.001, "num_tokens": 330242001.0, "reward": 0.8450521230697632, "reward_std": 0.10722117125988007, "rewards/accuracy_reward": 0.6901041865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 834.662109375, "completions/mean_terminated_length": 821.87255859375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4208, "grad_norm": 0.00022804480977356434, "learning_rate": 1.0000000000000002e-06, "loss": 0.0015, "num_tokens": 331774538.0, "reward": 0.8606771230697632, "reward_std": 0.11335291713476181, "rewards/accuracy_reward": 0.7259114384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 715.8268432617188, "completions/mean_terminated_length": 698.1295776367188, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4224, "grad_norm": 0.00022174509649630636, "learning_rate": 9.800000000000001e-07, "loss": 0.0037, "num_tokens": 333122304.0, "reward": 0.8701171875, "reward_std": 0.08800150454044342, "rewards/accuracy_reward": 0.7454426884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3639.0, "completions/mean_length": 769.658203125, "completions/mean_terminated_length": 758.7948608398438, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.424, "grad_norm": 0.00020780535123776644, "learning_rate": 9.600000000000001e-07, "loss": 0.0042, "num_tokens": 334553139.0, "reward": 0.8502604365348816, "reward_std": 0.10006933659315109, "rewards/accuracy_reward": 0.7037760615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3761.0, "completions/mean_length": 738.7428588867188, "completions/mean_terminated_length": 725.5771484375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.4256, "grad_norm": 0.00026626145699992776, "learning_rate": 9.400000000000001e-07, "loss": 0.0062, "num_tokens": 335933064.0, "reward": 0.8792318105697632, "reward_std": 0.11898082494735718, "rewards/accuracy_reward": 0.7630208134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 698.4303588867188, "completions/mean_terminated_length": 694.0006103515625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4272, "grad_norm": 0.0002829855657182634, "learning_rate": 9.200000000000001e-07, "loss": 0.0042, "num_tokens": 337267421.0, "reward": 0.8743489980697632, "reward_std": 0.09112297743558884, "rewards/accuracy_reward": 0.75, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 745.5690307617188, "completions/mean_terminated_length": 736.8211669921875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4288, "grad_norm": 0.00020174207747913897, "learning_rate": 9.000000000000001e-07, "loss": 0.0037, "num_tokens": 338658215.0, "reward": 0.84765625, "reward_std": 0.09898428618907928, "rewards/accuracy_reward": 0.6979166865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 720.3932495117188, "completions/mean_terminated_length": 718.1941528320312, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.4304, "grad_norm": 0.00022782105952501297, "learning_rate": 8.8e-07, "loss": 0.0021, "num_tokens": 340021411.0, "reward": 0.830078125, "reward_std": 0.10542221367359161, "rewards/accuracy_reward": 0.6673176884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 751.0853271484375, "completions/mean_terminated_length": 746.7242431640625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.432, "grad_norm": 0.00021056954574305564, "learning_rate": 8.6e-07, "loss": 0.0013, "num_tokens": 341440742.0, "reward": 0.8395182490348816, "reward_std": 0.09241904318332672, "rewards/accuracy_reward": 0.6822916865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 270 }, { "epoch": 0.432, "eval_completions/clipped_ratio": 0.0009765625, "eval_completions/max_length": 2529.875, "eval_completions/max_terminated_length": 2301.125, "eval_completions/mean_length": 731.6545257568359, "eval_completions/mean_terminated_length": 728.3899230957031, "eval_completions/min_length": 221.375, "eval_completions/min_terminated_length": 221.375, "eval_loss": 0.0, "eval_num_tokens": 341440742.0, "eval_reward": 0.8427734375, "eval_reward_std": 0.2322057280689478, "eval_rewards/accuracy_reward": 0.6865234375, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9990234375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 167.0144, "eval_samples_per_second": 5.988, "eval_steps_per_second": 0.048, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 643.9622802734375, "completions/mean_terminated_length": 643.9622802734375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.4336, "grad_norm": 0.00024367033620364964, "learning_rate": 8.400000000000001e-07, "loss": 0.0005, "num_tokens": 342671660.0, "reward": 0.8837890625, "reward_std": 0.08593781292438507, "rewards/accuracy_reward": 0.767578125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2075.0, "completions/mean_length": 787.2200927734375, "completions/mean_terminated_length": 782.9061279296875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.4352, "grad_norm": 0.00020836593466810882, "learning_rate": 8.200000000000001e-07, "loss": 0.0001, "num_tokens": 344137086.0, "reward": 0.8642578125, "reward_std": 0.10475035756826401, "rewards/accuracy_reward": 0.7298176884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 739.3639526367188, "completions/mean_terminated_length": 717.36767578125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.4368, "grad_norm": 0.00026614725356921554, "learning_rate": 8.000000000000001e-07, "loss": 0.0045, "num_tokens": 345525165.0, "reward": 0.8388671875, "reward_std": 0.11899750679731369, "rewards/accuracy_reward": 0.6848958134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 689.28125, "completions/mean_terminated_length": 682.614501953125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.4384, "grad_norm": 0.0002463553682900965, "learning_rate": 7.8e-07, "loss": 0.0041, "num_tokens": 346833821.0, "reward": 0.837890625, "reward_std": 0.10158007591962814, "rewards/accuracy_reward": 0.677734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 740.0143432617188, "completions/mean_terminated_length": 737.8280029296875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.44, "grad_norm": 0.00021535903215408325, "learning_rate": 7.6e-07, "loss": 0.002, "num_tokens": 348219219.0, "reward": 0.8170573115348816, "reward_std": 0.10654354095458984, "rewards/accuracy_reward": 0.634765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3927.0, "completions/mean_length": 749.7890625, "completions/mean_terminated_length": 734.4695434570312, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.4416, "grad_norm": 0.00022565819381270558, "learning_rate": 7.4e-07, "loss": 0.0022, "num_tokens": 349637263.0, "reward": 0.8258463740348816, "reward_std": 0.09841002523899078, "rewards/accuracy_reward": 0.6575520634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 850.6875, "completions/mean_terminated_length": 840.0888061523438, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.4432, "grad_norm": 0.000247880641836673, "learning_rate": 7.2e-07, "loss": 0.0064, "num_tokens": 351200847.0, "reward": 0.8479818105697632, "reward_std": 0.13545231521129608, "rewards/accuracy_reward": 0.7018229365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 698.8483276367188, "completions/mean_terminated_length": 696.6351928710938, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.4448, "grad_norm": 0.00021189654944464564, "learning_rate": 7.000000000000001e-07, "loss": 0.0051, "num_tokens": 352527782.0, "reward": 0.830078125, "reward_std": 0.10204245895147324, "rewards/accuracy_reward": 0.6627604365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3357.0, "completions/mean_length": 752.1217651367188, "completions/mean_terminated_length": 749.9432983398438, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4464, "grad_norm": 0.0002113265945808962, "learning_rate": 6.800000000000001e-07, "loss": 0.0049, "num_tokens": 353932065.0, "reward": 0.873046875, "reward_std": 0.0940742939710617, "rewards/accuracy_reward": 0.7473958134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2299.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 734.0032958984375, "completions/mean_terminated_length": 734.0032958984375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.448, "grad_norm": 0.00024331352324225008, "learning_rate": 6.6e-07, "loss": 0.0003, "num_tokens": 355318406.0, "reward": 0.8502604365348816, "reward_std": 0.11429958045482635, "rewards/accuracy_reward": 0.7005208134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 751.4088745117188, "completions/mean_terminated_length": 751.4088745117188, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.4496, "grad_norm": 0.0001822277990868315, "learning_rate": 6.4e-07, "loss": 0.0001, "num_tokens": 356717242.0, "reward": 0.8955078125, "reward_std": 0.08489221334457397, "rewards/accuracy_reward": 0.791015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3218.0, "completions/mean_length": 703.07421875, "completions/mean_terminated_length": 700.8638305664062, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4512, "grad_norm": 0.00019573996542021632, "learning_rate": 6.200000000000001e-07, "loss": 0.004, "num_tokens": 358048780.0, "reward": 0.8929036855697632, "reward_std": 0.08383297920227051, "rewards/accuracy_reward": 0.7864583134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 768.2421875, "completions/mean_terminated_length": 755.1921997070312, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4528, "grad_norm": 0.00024182167544495314, "learning_rate": 6.000000000000001e-07, "loss": 0.0032, "num_tokens": 359475232.0, "reward": 0.8567708730697632, "reward_std": 0.09553200006484985, "rewards/accuracy_reward": 0.71875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3779.0, "completions/mean_length": 752.1536865234375, "completions/mean_terminated_length": 741.233154296875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4544, "grad_norm": 0.00019411960965953767, "learning_rate": 5.800000000000001e-07, "loss": 0.0036, "num_tokens": 360883020.0, "reward": 0.859375, "reward_std": 0.08536865562200546, "rewards/accuracy_reward": 0.72265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 688.0091552734375, "completions/mean_terminated_length": 688.0091552734375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.456, "grad_norm": 0.0002204052871093154, "learning_rate": 5.6e-07, "loss": -0.0003, "num_tokens": 362187578.0, "reward": 0.8512369990348816, "reward_std": 0.09851017594337463, "rewards/accuracy_reward": 0.7024739384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 285 }, { "epoch": 0.456, "eval_completions/clipped_ratio": 0.00390625, "eval_completions/max_length": 3353.75, "eval_completions/max_terminated_length": 2226.125, "eval_completions/mean_length": 745.3162612915039, "eval_completions/mean_terminated_length": 732.1685333251953, "eval_completions/min_length": 219.75, "eval_completions/min_terminated_length": 219.75, "eval_loss": 0.0, "eval_num_tokens": 362187578.0, "eval_reward": 0.8408203125, "eval_reward_std": 0.2368363793939352, "eval_rewards/accuracy_reward": 0.6865234375, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9951171875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 216.1448, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.037, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 682.5306396484375, "completions/mean_terminated_length": 678.0802001953125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.4576, "grad_norm": 0.00021837043459527194, "learning_rate": 5.4e-07, "loss": 0.0022, "num_tokens": 363485705.0, "reward": 0.8971354365348816, "reward_std": 0.08655188977718353, "rewards/accuracy_reward": 0.7955729365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 786.5514526367188, "completions/mean_terminated_length": 784.3954467773438, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.4592, "grad_norm": 0.00024860716075636446, "learning_rate": 5.2e-07, "loss": 0.0003, "num_tokens": 364957496.0, "reward": 0.8245443105697632, "reward_std": 0.11009486764669418, "rewards/accuracy_reward": 0.6497395634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 821.9694213867188, "completions/mean_terminated_length": 809.1300659179688, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.4608, "grad_norm": 0.00024540640879422426, "learning_rate": 5.000000000000001e-07, "loss": 0.0022, "num_tokens": 366479529.0, "reward": 0.82421875, "reward_std": 0.11675791442394257, "rewards/accuracy_reward": 0.654296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 826.328125, "completions/mean_terminated_length": 796.2522583007812, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.4624, "grad_norm": 0.0001585317077115178, "learning_rate": 4.800000000000001e-07, "loss": 0.0026, "num_tokens": 367994369.0, "reward": 0.8173828125, "reward_std": 0.10059963911771774, "rewards/accuracy_reward": 0.646484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3768.0, "completions/max_terminated_length": 3768.0, "completions/mean_length": 690.62890625, "completions/mean_terminated_length": 690.62890625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.464, "grad_norm": 0.00019122941012028605, "learning_rate": 4.6000000000000004e-07, "loss": 0.0014, "num_tokens": 369295623.0, "reward": 0.8492838740348816, "reward_std": 0.07566020637750626, "rewards/accuracy_reward": 0.69921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 742.033203125, "completions/mean_terminated_length": 739.8482055664062, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.4656, "grad_norm": 0.00019408087246119976, "learning_rate": 4.4e-07, "loss": 0.0011, "num_tokens": 370676442.0, "reward": 0.8528646230697632, "reward_std": 0.10311831533908844, "rewards/accuracy_reward": 0.70703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 794.3079833984375, "completions/mean_terminated_length": 774.8480834960938, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4672, "grad_norm": 0.00018843395810108632, "learning_rate": 4.2000000000000006e-07, "loss": 0.0028, "num_tokens": 372152179.0, "reward": 0.8333333730697632, "reward_std": 0.09596847742795944, "rewards/accuracy_reward": 0.6731770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4016.0, "completions/mean_length": 735.3939208984375, "completions/mean_terminated_length": 731.0123901367188, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.4688, "grad_norm": 0.000248933385591954, "learning_rate": 4.0000000000000003e-07, "loss": 0.0045, "num_tokens": 373541616.0, "reward": 0.8645833730697632, "reward_std": 0.11629575490951538, "rewards/accuracy_reward": 0.732421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3850.0, "completions/mean_length": 674.77734375, "completions/mean_terminated_length": 670.3168334960938, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4704, "grad_norm": 0.00018482068844605237, "learning_rate": 3.8e-07, "loss": 0.0035, "num_tokens": 374811802.0, "reward": 0.84765625, "reward_std": 0.06946199387311935, "rewards/accuracy_reward": 0.697265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2212.0, "completions/mean_length": 748.908203125, "completions/mean_terminated_length": 744.5443115234375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.472, "grad_norm": 0.00019267553579993546, "learning_rate": 3.6e-07, "loss": 0.0023, "num_tokens": 376206925.0, "reward": 0.8782552480697632, "reward_std": 0.0801345556974411, "rewards/accuracy_reward": 0.7578125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1984.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 717.1784057617188, "completions/mean_terminated_length": 717.1784057617188, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.4736, "grad_norm": 0.0001787830697139725, "learning_rate": 3.4000000000000003e-07, "loss": 0.0003, "num_tokens": 377570655.0, "reward": 0.861328125, "reward_std": 0.06910397857427597, "rewards/accuracy_reward": 0.72265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 806.9127807617188, "completions/mean_terminated_length": 789.6924438476562, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4752, "grad_norm": 0.00022632272157352418, "learning_rate": 3.2e-07, "loss": 0.0071, "num_tokens": 379070553.0, "reward": 0.806640625, "reward_std": 0.12033604085445404, "rewards/accuracy_reward": 0.619140625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2632.0, "completions/mean_length": 711.392578125, "completions/mean_terminated_length": 706.9797973632812, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.4768, "grad_norm": 0.00023846919066272676, "learning_rate": 3.0000000000000004e-07, "loss": 0.0057, "num_tokens": 380405684.0, "reward": 0.8411458730697632, "reward_std": 0.10618966072797775, "rewards/accuracy_reward": 0.68359375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 770.9525146484375, "completions/mean_terminated_length": 770.9525146484375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4784, "grad_norm": 0.00022093193547334522, "learning_rate": 2.8e-07, "loss": 0.0031, "num_tokens": 381833547.0, "reward": 0.8746744990348816, "reward_std": 0.11011002957820892, "rewards/accuracy_reward": 0.7493489384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3911.0, "completions/mean_length": 756.6920776367188, "completions/mean_terminated_length": 745.786376953125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.48, "grad_norm": 0.0002568686904851347, "learning_rate": 2.6e-07, "loss": 0.0021, "num_tokens": 383245522.0, "reward": 0.8369140625, "reward_std": 0.11426260322332382, "rewards/accuracy_reward": 0.6796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 300 }, { "epoch": 0.48, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1950.625, "eval_completions/max_terminated_length": 1950.625, "eval_completions/mean_length": 733.9456176757812, "eval_completions/mean_terminated_length": 733.9456176757812, "eval_completions/min_length": 219.25, "eval_completions/min_terminated_length": 219.25, "eval_loss": 0.0, "eval_num_tokens": 383245522.0, "eval_reward": 0.84765625, "eval_reward_std": 0.2294679880142212, "eval_rewards/accuracy_reward": 0.6953125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 133.2939, "eval_samples_per_second": 7.502, "eval_steps_per_second": 0.06, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 811.9818115234375, "completions/mean_terminated_length": 801.2566528320312, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.4816, "grad_norm": 0.00020574239897541702, "learning_rate": 2.4000000000000003e-07, "loss": 0.0026, "num_tokens": 384732854.0, "reward": 0.8479818105697632, "reward_std": 0.09479741752147675, "rewards/accuracy_reward": 0.7037760615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 707.0494995117188, "completions/mean_terminated_length": 704.8416748046875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.4832, "grad_norm": 0.000241117988480255, "learning_rate": 2.2e-07, "loss": 0.0031, "num_tokens": 386062626.0, "reward": 0.8912760615348816, "reward_std": 0.11178864538669586, "rewards/accuracy_reward": 0.783203125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 743.7239990234375, "completions/mean_terminated_length": 741.5400390625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.4848, "grad_norm": 0.00021438412659335881, "learning_rate": 2.0000000000000002e-07, "loss": 0.0015, "num_tokens": 387462234.0, "reward": 0.888671875, "reward_std": 0.10154387354850769, "rewards/accuracy_reward": 0.7779948115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2149.0, "completions/mean_length": 774.8763427734375, "completions/mean_terminated_length": 772.7127075195312, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4864, "grad_norm": 0.00020632801169995219, "learning_rate": 1.8e-07, "loss": 0.0029, "num_tokens": 388906556.0, "reward": 0.8707682490348816, "reward_std": 0.10759743303060532, "rewards/accuracy_reward": 0.7421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 712.5091552734375, "completions/mean_terminated_length": 705.8878173828125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.488, "grad_norm": 0.00017039969679899514, "learning_rate": 1.6e-07, "loss": 0.001, "num_tokens": 390252842.0, "reward": 0.9085286855697632, "reward_std": 0.06514652073383331, "rewards/accuracy_reward": 0.8190104365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 746.9486083984375, "completions/mean_terminated_length": 744.7667846679688, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4896, "grad_norm": 0.0002216668362962082, "learning_rate": 1.4e-07, "loss": 0.0037, "num_tokens": 391654203.0, "reward": 0.8333333730697632, "reward_std": 0.1073928028345108, "rewards/accuracy_reward": 0.66796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3614.0, "completions/mean_length": 804.5078125, "completions/mean_terminated_length": 800.2164306640625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4912, "grad_norm": 0.00020991609198972583, "learning_rate": 1.2000000000000002e-07, "loss": 0.0038, "num_tokens": 393129959.0, "reward": 0.8577474355697632, "reward_std": 0.09668704122304916, "rewards/accuracy_reward": 0.716796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2278.0, "completions/max_terminated_length": 2278.0, "completions/mean_length": 669.4935302734375, "completions/mean_terminated_length": 669.4935302734375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.4928, "grad_norm": 0.0002030891046160832, "learning_rate": 1.0000000000000001e-07, "loss": 0.0021, "num_tokens": 394412061.0, "reward": 0.9069010615348816, "reward_std": 0.07275529205799103, "rewards/accuracy_reward": 0.8138020634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 666.9166870117188, "completions/mean_terminated_length": 666.9166870117188, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.4944, "grad_norm": 0.00021266087424010038, "learning_rate": 8e-08, "loss": 0.0004, "num_tokens": 395688701.0, "reward": 0.8479818105697632, "reward_std": 0.07628273218870163, "rewards/accuracy_reward": 0.6959635615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3892.0, "completions/mean_length": 770.7584838867188, "completions/mean_terminated_length": 768.5921630859375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.496, "grad_norm": 0.0002053970965789631, "learning_rate": 6.000000000000001e-08, "loss": 0.0033, "num_tokens": 397125770.0, "reward": 0.8515625, "reward_std": 0.09819101542234421, "rewards/accuracy_reward": 0.7044270634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 834.8568115234375, "completions/mean_terminated_length": 828.4749145507812, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.4976, "grad_norm": 0.0002385937696089968, "learning_rate": 4e-08, "loss": 0.0029, "num_tokens": 398660142.0, "reward": 0.83203125, "reward_std": 0.1356370747089386, "rewards/accuracy_reward": 0.666015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4066.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 756.2623901367188, "completions/mean_terminated_length": 756.2623901367188, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.4992, "grad_norm": 0.0002649789385031909, "learning_rate": 2e-08, "loss": 0.0024, "num_tokens": 400084417.0, "reward": 0.8600260615348816, "reward_std": 0.1263372302055359, "rewards/accuracy_reward": 0.7213541865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3995.0, "completions/mean_length": 774.5189208984375, "completions/mean_terminated_length": 765.8466186523438, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5008, "grad_norm": 0.00019712450739461929, "learning_rate": 0.0, "loss": 0.0036, "num_tokens": 401528574.0, "reward": 0.85546875, "reward_std": 0.08956264704465866, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 313 }, { "epoch": 0.5008, "step": 313, "total_flos": 0.0, "train_loss": 0.005064463953700693, "train_runtime": 58590.3695, "train_samples_per_second": 0.256, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 313, "num_input_tokens_seen": 401528574, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }