{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 15, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0595703125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 252.85693359375, "completions/mean_terminated_length": 204.00987243652344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 0.01206118892878294, "learning_rate": 2.5e-07, "loss": 0.062, "num_tokens": 3474099.0, "reward": 0.646336019039154, "reward_std": 0.503198504447937, "rewards/accuracy_reward": 0.24609375, "rewards/brier_reward": 0.37909579277038574, "rewards/confidence_one_or_zero": 0.26953125, "rewards/format_reward": 0.66748046875, "rewards/mean_confidence_reward": 0.7516889572143555, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0556640625, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 257.6103515625, "completions/mean_terminated_length": 212.43536376953125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0256, "grad_norm": 0.05630214512348175, "learning_rate": 5e-07, "loss": 0.06, "num_tokens": 7050869.0, "reward": 0.5985734462738037, "reward_std": 0.4480513036251068, "rewards/accuracy_reward": 0.171875, "rewards/brier_reward": 0.3416762948036194, "rewards/confidence_one_or_zero": 0.25537109375, "rewards/format_reward": 0.68359375, "rewards/mean_confidence_reward": 0.7263393402099609, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 254.6923828125, "completions/mean_terminated_length": 211.86495971679688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0384, "grad_norm": 0.02448289282619953, "learning_rate": 7.5e-07, "loss": 0.0532, "num_tokens": 10556279.0, "reward": 0.6241230964660645, "reward_std": 0.45209184288978577, "rewards/accuracy_reward": 0.203125, "rewards/brier_reward": 0.36250197887420654, "rewards/confidence_one_or_zero": 0.26611328125, "rewards/format_reward": 0.6826171875, "rewards/mean_confidence_reward": 0.7384810447692871, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05224609375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 255.78857421875, "completions/mean_terminated_length": 213.43997192382812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0512, "grad_norm": 0.009398565627634525, "learning_rate": 1e-06, "loss": 0.0588, "num_tokens": 14071174.0, "reward": 0.625983715057373, "reward_std": 0.46379736065864563, "rewards/accuracy_reward": 0.1982421875, "rewards/brier_reward": 0.3657350540161133, "rewards/confidence_one_or_zero": 0.271484375, "rewards/format_reward": 0.68798828125, "rewards/mean_confidence_reward": 0.7340266704559326, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04931640625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 250.24267578125, "completions/mean_terminated_length": 210.104248046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.064, "grad_norm": 0.010432912968099117, "learning_rate": 1e-06, "loss": 0.0609, "num_tokens": 17581279.0, "reward": 0.6377379894256592, "reward_std": 0.4767524302005768, "rewards/accuracy_reward": 0.2119140625, "rewards/brier_reward": 0.37361857295036316, "rewards/confidence_one_or_zero": 0.2607421875, "rewards/format_reward": 0.68994140625, "rewards/mean_confidence_reward": 0.72353196144104, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0419921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 233.49072265625, "completions/mean_terminated_length": 198.8404541015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0768, "grad_norm": 0.03446255251765251, "learning_rate": 1e-06, "loss": 0.0446, "num_tokens": 21031396.0, "reward": 0.7123662233352661, "reward_std": 0.4492800533771515, "rewards/accuracy_reward": 0.2568359375, "rewards/brier_reward": 0.41545307636260986, "rewards/confidence_one_or_zero": 0.2783203125, "rewards/format_reward": 0.75244140625, "rewards/mean_confidence_reward": 0.7652001976966858, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0322265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 218.8916015625, "completions/mean_terminated_length": 192.08172607421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0896, "grad_norm": 0.03061598353087902, "learning_rate": 1e-06, "loss": 0.043, "num_tokens": 24512238.0, "reward": 0.7533546686172485, "reward_std": 0.40088847279548645, "rewards/accuracy_reward": 0.25, "rewards/brier_reward": 0.43981271982192993, "rewards/confidence_one_or_zero": 0.259765625, "rewards/format_reward": 0.81689453125, "rewards/mean_confidence_reward": 0.7687591314315796, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 187.7744140625, "completions/mean_terminated_length": 172.81211853027344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.1024, "grad_norm": 0.014748022891581059, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 27904744.0, "reward": 0.8088976144790649, "reward_std": 0.3649292290210724, "rewards/accuracy_reward": 0.26953125, "rewards/brier_reward": 0.46837902069091797, "rewards/confidence_one_or_zero": 0.2646484375, "rewards/format_reward": 0.8798828125, "rewards/mean_confidence_reward": 0.7798730134963989, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 175.26171875, "completions/mean_terminated_length": 161.78968811035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.1152, "grad_norm": 0.1248084306716919, "learning_rate": 1e-06, "loss": 0.0267, "num_tokens": 31256936.0, "reward": 0.8849074840545654, "reward_std": 0.3317580819129944, "rewards/accuracy_reward": 0.32470703125, "rewards/brier_reward": 0.5227425694465637, "rewards/confidence_one_or_zero": 0.2705078125, "rewards/format_reward": 0.92236328125, "rewards/mean_confidence_reward": 0.7816927433013916, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 162.44384765625, "completions/mean_terminated_length": 148.7683563232422, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.128, "grad_norm": 0.013320323079824448, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 34584917.0, "reward": 0.8880295753479004, "reward_std": 0.3083075284957886, "rewards/accuracy_reward": 0.30712890625, "rewards/brier_reward": 0.5314282178878784, "rewards/confidence_one_or_zero": 0.2568359375, "rewards/format_reward": 0.9375, "rewards/mean_confidence_reward": 0.7687011957168579, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00927734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 149.92822265625, "completions/mean_terminated_length": 141.74322509765625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1408, "grad_norm": 0.009336220100522041, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 37879610.0, "reward": 0.9682935476303101, "reward_std": 0.30119815468788147, "rewards/accuracy_reward": 0.39697265625, "rewards/brier_reward": 0.5859990119934082, "rewards/confidence_one_or_zero": 0.27587890625, "rewards/format_reward": 0.95361328125, "rewards/mean_confidence_reward": 0.7835807204246521, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 148.54931640625, "completions/mean_terminated_length": 143.38949584960938, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1536, "grad_norm": 0.01234795618802309, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 41182895.0, "reward": 0.9633985757827759, "reward_std": 0.27885445952415466, "rewards/accuracy_reward": 0.36572265625, "rewards/brier_reward": 0.588416337966919, "rewards/confidence_one_or_zero": 0.21923828125, "rewards/format_reward": 0.97265625, "rewards/mean_confidence_reward": 0.7653173804283142, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0029296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 134.34326171875, "completions/mean_terminated_length": 131.72918701171875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1664, "grad_norm": 0.011425751261413097, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 44485246.0, "reward": 0.9407795071601868, "reward_std": 0.2531838119029999, "rewards/accuracy_reward": 0.32666015625, "rewards/brier_reward": 0.5758930444717407, "rewards/confidence_one_or_zero": 0.20947265625, "rewards/format_reward": 0.97900390625, "rewards/mean_confidence_reward": 0.75456702709198, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0029296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 138.37548828125, "completions/mean_terminated_length": 135.77325439453125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1792, "grad_norm": 0.004313925746828318, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 47816527.0, "reward": 0.9434206485748291, "reward_std": 0.24209100008010864, "rewards/accuracy_reward": 0.31884765625, "rewards/brier_reward": 0.5865465402603149, "rewards/confidence_one_or_zero": 0.15283203125, "rewards/format_reward": 0.9814453125, "rewards/mean_confidence_reward": 0.7383813858032227, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00439453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 133.2216796875, "completions/mean_terminated_length": 129.28985595703125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.192, "grad_norm": 0.00784427672624588, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 51033389.0, "reward": 0.983768105506897, "reward_std": 0.24871209263801575, "rewards/accuracy_reward": 0.361328125, "rewards/brier_reward": 0.623296320438385, "rewards/confidence_one_or_zero": 0.1484375, "rewards/format_reward": 0.98291015625, "rewards/mean_confidence_reward": 0.7346706986427307, "step": 15 }, { "epoch": 0.192, "eval_completions/clipped_ratio": 0.00390625, "eval_completions/max_length": 675.0, "eval_completions/max_terminated_length": 347.5, "eval_completions/mean_length": 134.63638305664062, "eval_completions/mean_terminated_length": 131.14013671875, "eval_completions/min_length": 56.0, "eval_completions/min_terminated_length": 56.0, "eval_loss": 0.0, "eval_num_tokens": 51033389.0, "eval_reward": 0.9371011257171631, "eval_reward_std": 0.3611362501978874, "eval_rewards/accuracy_reward": 0.30078125, "eval_rewards/brier_reward": 0.5890443176031113, "eval_rewards/confidence_one_or_zero": 0.12109375, "eval_rewards/format_reward": 0.984375, "eval_rewards/mean_confidence_reward": 0.712636724114418, "eval_runtime": 50.8118, "eval_samples_per_second": 9.84, "eval_steps_per_second": 0.079, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00244140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 136.1689453125, "completions/mean_terminated_length": 133.99607849121094, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.2048, "grad_norm": 0.015799518674612045, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 54328887.0, "reward": 0.9970545768737793, "reward_std": 0.23557817935943604, "rewards/accuracy_reward": 0.38330078125, "rewards/brier_reward": 0.6230137348175049, "rewards/confidence_one_or_zero": 0.13623046875, "rewards/format_reward": 0.98779296875, "rewards/mean_confidence_reward": 0.7369694113731384, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00244140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 130.96484375, "completions/mean_terminated_length": 128.7792510986328, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2176, "grad_norm": 0.011801626533269882, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 57602287.0, "reward": 0.9894813299179077, "reward_std": 0.22355304658412933, "rewards/accuracy_reward": 0.35205078125, "rewards/brier_reward": 0.6386289596557617, "rewards/confidence_one_or_zero": 0.13720703125, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.7070361375808716, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00244140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 133.64404296875, "completions/mean_terminated_length": 131.46499633789062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.2304, "grad_norm": 0.003088222583755851, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 60943718.0, "reward": 1.0247716903686523, "reward_std": 0.2183222770690918, "rewards/accuracy_reward": 0.39892578125, "rewards/brier_reward": 0.6613582372665405, "rewards/confidence_one_or_zero": 0.11376953125, "rewards/format_reward": 0.9892578125, "rewards/mean_confidence_reward": 0.7037431001663208, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 130.42626953125, "completions/mean_terminated_length": 128.67759704589844, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.2432, "grad_norm": 0.0030834106728434563, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 64218495.0, "reward": 1.0376394987106323, "reward_std": 0.22437486052513123, "rewards/accuracy_reward": 0.40869140625, "rewards/brier_reward": 0.6768399477005005, "rewards/confidence_one_or_zero": 0.10986328125, "rewards/format_reward": 0.98974609375, "rewards/mean_confidence_reward": 0.6858630180358887, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 129.1884765625, "completions/mean_terminated_length": 128.31378173828125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.256, "grad_norm": 0.0019296990940347314, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 67508849.0, "reward": 1.0304187536239624, "reward_std": 0.20604413747787476, "rewards/accuracy_reward": 0.39013671875, "rewards/brier_reward": 0.6755821108818054, "rewards/confidence_one_or_zero": 0.0966796875, "rewards/format_reward": 0.9951171875, "rewards/mean_confidence_reward": 0.6770117282867432, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 134.54052734375, "completions/mean_terminated_length": 134.1060028076172, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2688, "grad_norm": 0.0030771668534725904, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 70787884.0, "reward": 1.0380725860595703, "reward_std": 0.197679340839386, "rewards/accuracy_reward": 0.39208984375, "rewards/brier_reward": 0.6879599690437317, "rewards/confidence_one_or_zero": 0.0791015625, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6546210646629333, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00146484375, "completions/max_length": 1024.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 131.81640625, "completions/mean_terminated_length": 130.50758361816406, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.2816, "grad_norm": 0.001605757512152195, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 74049700.0, "reward": 1.0163131952285767, "reward_std": 0.19691747426986694, "rewards/accuracy_reward": 0.35302734375, "rewards/brier_reward": 0.6854572296142578, "rewards/confidence_one_or_zero": 0.076171875, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6287341713905334, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 130.5732421875, "completions/mean_terminated_length": 130.5732421875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.2944, "grad_norm": 0.0016451003029942513, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 77286914.0, "reward": 1.052234411239624, "reward_std": 0.19646784663200378, "rewards/accuracy_reward": 0.39404296875, "rewards/brier_reward": 0.7128661274909973, "rewards/confidence_one_or_zero": 0.06005859375, "rewards/format_reward": 0.99755859375, "rewards/mean_confidence_reward": 0.6058691143989563, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 135.2314453125, "completions/mean_terminated_length": 134.36265563964844, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3072, "grad_norm": 0.0020764193031936884, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 80575708.0, "reward": 1.030700922012329, "reward_std": 0.17984464764595032, "rewards/accuracy_reward": 0.357421875, "rewards/brier_reward": 0.7069083452224731, "rewards/confidence_one_or_zero": 0.05517578125, "rewards/format_reward": 0.9970703125, "rewards/mean_confidence_reward": 0.5960919857025146, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 135.84375, "completions/mean_terminated_length": 134.97555541992188, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.32, "grad_norm": 0.0012462260201573372, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 83862556.0, "reward": 1.0884177684783936, "reward_std": 0.16984757781028748, "rewards/accuracy_reward": 0.43359375, "rewards/brier_reward": 0.7461701035499573, "rewards/confidence_one_or_zero": 0.060546875, "rewards/format_reward": 0.9970703125, "rewards/mean_confidence_reward": 0.5713028907775879, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00146484375, "completions/max_length": 1024.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 138.23193359375, "completions/mean_terminated_length": 136.93251037597656, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.3328, "grad_norm": 0.0014977873070165515, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 87091063.0, "reward": 1.076505184173584, "reward_std": 0.16160905361175537, "rewards/accuracy_reward": 0.4140625, "rewards/brier_reward": 0.741388201713562, "rewards/confidence_one_or_zero": 0.0517578125, "rewards/format_reward": 0.99755859375, "rewards/mean_confidence_reward": 0.5425801277160645, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 137.69921875, "completions/mean_terminated_length": 136.8328399658203, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3456, "grad_norm": 0.0013432031264528632, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 90390975.0, "reward": 1.0520920753479004, "reward_std": 0.1620258092880249, "rewards/accuracy_reward": 0.36328125, "rewards/brier_reward": 0.7428549528121948, "rewards/confidence_one_or_zero": 0.048828125, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.5067415237426758, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 139.10302734375, "completions/mean_terminated_length": 138.6707305908203, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3584, "grad_norm": 0.001807131338864565, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 93687954.0, "reward": 1.068045735359192, "reward_std": 0.14087209105491638, "rewards/accuracy_reward": 0.3828125, "rewards/brier_reward": 0.7547427415847778, "rewards/confidence_one_or_zero": 0.04541015625, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.4919547438621521, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 144.34716796875, "completions/mean_terminated_length": 143.9174346923828, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3712, "grad_norm": 0.0023208060301840305, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 97022633.0, "reward": 1.065900206565857, "reward_std": 0.13991808891296387, "rewards/accuracy_reward": 0.37255859375, "rewards/brier_reward": 0.7607055902481079, "rewards/confidence_one_or_zero": 0.05712890625, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.4605600833892822, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 143.078125, "completions/mean_terminated_length": 143.078125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.384, "grad_norm": 0.0033083283342421055, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 100277401.0, "reward": 1.080049753189087, "reward_std": 0.13700106739997864, "rewards/accuracy_reward": 0.396484375, "rewards/brier_reward": 0.7650789618492126, "rewards/confidence_one_or_zero": 0.0498046875, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.4552270770072937, "step": 30 }, { "epoch": 0.384, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 353.25, "eval_completions/max_terminated_length": 353.25, "eval_completions/mean_length": 144.8663787841797, "eval_completions/mean_terminated_length": 144.8663787841797, "eval_completions/min_length": 73.75, "eval_completions/min_terminated_length": 73.75, "eval_loss": 0.0, "eval_num_tokens": 100277401.0, "eval_reward": 1.0514324307441711, "eval_reward_std": 0.226553276181221, "eval_rewards/accuracy_reward": 0.330078125, "eval_rewards/brier_reward": 0.7727857530117035, "eval_rewards/confidence_one_or_zero": 0.0546875, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.4388085976243019, "eval_runtime": 36.7184, "eval_samples_per_second": 13.617, "eval_steps_per_second": 0.109, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 143.3134765625, "completions/mean_terminated_length": 142.88323974609375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.3968, "grad_norm": 0.0009655402973294258, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 103574067.0, "reward": 1.0772579908370972, "reward_std": 0.13317140936851501, "rewards/accuracy_reward": 0.3837890625, "rewards/brier_reward": 0.7721908092498779, "rewards/confidence_one_or_zero": 0.05224609375, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.4339550733566284, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 147.052734375, "completions/mean_terminated_length": 146.62432861328125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.4096, "grad_norm": 0.0014063868438825011, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 106852399.0, "reward": 1.0804073810577393, "reward_std": 0.12738269567489624, "rewards/accuracy_reward": 0.39697265625, "rewards/brier_reward": 0.7653061151504517, "rewards/confidence_one_or_zero": 0.04296875, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.42521679401397705, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 144.61669921875, "completions/mean_terminated_length": 144.18710327148438, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.4224, "grad_norm": 0.0014064498245716095, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 110157062.0, "reward": 1.0807609558105469, "reward_std": 0.1330416202545166, "rewards/accuracy_reward": 0.39404296875, "rewards/brier_reward": 0.7699194550514221, "rewards/confidence_one_or_zero": 0.048828125, "rewards/format_reward": 0.99755859375, "rewards/mean_confidence_reward": 0.4001578688621521, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 150.60107421875, "completions/mean_terminated_length": 150.60107421875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.4352, "grad_norm": 0.0029456529300659895, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 113450349.0, "reward": 1.104508876800537, "reward_std": 0.11853398382663727, "rewards/accuracy_reward": 0.43408203125, "rewards/brier_reward": 0.7754232287406921, "rewards/confidence_one_or_zero": 0.04541015625, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.4032275378704071, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 151.95751953125, "completions/mean_terminated_length": 151.95751953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.448, "grad_norm": 0.0026214425452053547, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 116759078.0, "reward": 1.0844058990478516, "reward_std": 0.11485770344734192, "rewards/accuracy_reward": 0.408203125, "rewards/brier_reward": 0.7615846395492554, "rewards/confidence_one_or_zero": 0.044921875, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.38147109746932983, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 153.82666015625, "completions/mean_terminated_length": 153.82666015625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.4608, "grad_norm": 0.0012385396985337138, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 120127755.0, "reward": 1.0605785846710205, "reward_std": 0.10509373247623444, "rewards/accuracy_reward": 0.3427734375, "rewards/brier_reward": 0.7788711190223694, "rewards/confidence_one_or_zero": 0.04248046875, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.3768359422683716, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 153.3173828125, "completions/mean_terminated_length": 152.8920440673828, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4736, "grad_norm": 0.002045744564384222, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 123451997.0, "reward": 1.0735026597976685, "reward_std": 0.10574886202812195, "rewards/accuracy_reward": 0.37060546875, "rewards/brier_reward": 0.7773755788803101, "rewards/confidence_one_or_zero": 0.0498046875, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.3696533143520355, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 152.341796875, "completions/mean_terminated_length": 152.341796875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.4864, "grad_norm": 0.003814885625615716, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 126776897.0, "reward": 1.097827434539795, "reward_std": 0.10771030187606812, "rewards/accuracy_reward": 0.439453125, "rewards/brier_reward": 0.7566891312599182, "rewards/confidence_one_or_zero": 0.05615234375, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.34937989711761475, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 156.4228515625, "completions/mean_terminated_length": 156.4228515625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.4992, "grad_norm": 0.0009488159557804465, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 130164187.0, "reward": 1.098426103591919, "reward_std": 0.1080411821603775, "rewards/accuracy_reward": 0.44775390625, "rewards/brier_reward": 0.7495858073234558, "rewards/confidence_one_or_zero": 0.04248046875, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.35322752594947815, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 155.7001953125, "completions/mean_terminated_length": 155.7001953125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.512, "grad_norm": 0.0009362637065351009, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 133507421.0, "reward": 1.0903565883636475, "reward_std": 0.10990739613771439, "rewards/accuracy_reward": 0.41455078125, "rewards/brier_reward": 0.7676265239715576, "rewards/confidence_one_or_zero": 0.03759765625, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.3521386981010437, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 155.94287109375, "completions/mean_terminated_length": 155.94287109375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.5248, "grad_norm": 0.0018933570245280862, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 136822400.0, "reward": 1.0992754697799683, "reward_std": 0.10722452402114868, "rewards/accuracy_reward": 0.42822265625, "rewards/brier_reward": 0.7717922925949097, "rewards/confidence_one_or_zero": 0.05810546875, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.35319823026657104, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 159.2255859375, "completions/mean_terminated_length": 159.2255859375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.5376, "grad_norm": 0.0014252394903451204, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 140161310.0, "reward": 1.1002483367919922, "reward_std": 0.10786743462085724, "rewards/accuracy_reward": 0.43994140625, "rewards/brier_reward": 0.7615311145782471, "rewards/confidence_one_or_zero": 0.04345703125, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.3506225645542145, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 160.8203125, "completions/mean_terminated_length": 160.8203125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.5504, "grad_norm": 0.0009207865223288536, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 143509286.0, "reward": 1.1000754833221436, "reward_std": 0.11076341569423676, "rewards/accuracy_reward": 0.44384765625, "rewards/brier_reward": 0.7572791576385498, "rewards/confidence_one_or_zero": 0.03955078125, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.35358887910842896, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 160.43994140625, "completions/mean_terminated_length": 160.43994140625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.5632, "grad_norm": 0.0006189781124703586, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 146836963.0, "reward": 1.0809619426727295, "reward_std": 0.09901401400566101, "rewards/accuracy_reward": 0.38720703125, "rewards/brier_reward": 0.7761809825897217, "rewards/confidence_one_or_zero": 0.04248046875, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.34800535440444946, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 159.8115234375, "completions/mean_terminated_length": 159.8115234375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.576, "grad_norm": 0.02215094119310379, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 150188409.0, "reward": 1.0951708555221558, "reward_std": 0.10278713703155518, "rewards/accuracy_reward": 0.4228515625, "rewards/brier_reward": 0.7679775953292847, "rewards/confidence_one_or_zero": 0.033203125, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.35493797063827515, "step": 45 }, { "epoch": 0.576, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 287.75, "eval_completions/max_terminated_length": 287.75, "eval_completions/mean_length": 161.08263778686523, "eval_completions/mean_terminated_length": 161.08263778686523, "eval_completions/min_length": 85.5, "eval_completions/min_terminated_length": 85.5, "eval_loss": 0.0, "eval_num_tokens": 150188409.0, "eval_reward": 1.0704601407051086, "eval_reward_std": 0.18598725646734238, "eval_rewards/accuracy_reward": 0.349609375, "eval_rewards/brier_reward": 0.7913101464509964, "eval_rewards/confidence_one_or_zero": 0.03125, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.36238280683755875, "eval_runtime": 33.0054, "eval_samples_per_second": 15.149, "eval_steps_per_second": 0.121, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 159.986328125, "completions/mean_terminated_length": 159.986328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5888, "grad_norm": 0.003660782240331173, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 153552477.0, "reward": 1.095861792564392, "reward_std": 0.0982513576745987, "rewards/accuracy_reward": 0.4150390625, "rewards/brier_reward": 0.7776603102684021, "rewards/confidence_one_or_zero": 0.048828125, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.36439940333366394, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 161.75146484375, "completions/mean_terminated_length": 161.75146484375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.6016, "grad_norm": 0.004005622584372759, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 156878152.0, "reward": 1.1043840646743774, "reward_std": 0.10187876969575882, "rewards/accuracy_reward": 0.43408203125, "rewards/brier_reward": 0.7761501669883728, "rewards/confidence_one_or_zero": 0.03759765625, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.3677002191543579, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 163.7666015625, "completions/mean_terminated_length": 163.7666015625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.6144, "grad_norm": 0.0013804073678329587, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 160274850.0, "reward": 1.1110379695892334, "reward_std": 0.09990298002958298, "rewards/accuracy_reward": 0.45361328125, "rewards/brier_reward": 0.7684618234634399, "rewards/confidence_one_or_zero": 0.0341796875, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.3786962926387787, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 165.44677734375, "completions/mean_terminated_length": 165.44677734375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6272, "grad_norm": 0.0006175300804898143, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 163673933.0, "reward": 1.1008734703063965, "reward_std": 0.1041068434715271, "rewards/accuracy_reward": 0.43603515625, "rewards/brier_reward": 0.7657108902931213, "rewards/confidence_one_or_zero": 0.0205078125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.38051414489746094, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 166.4296875, "completions/mean_terminated_length": 166.4296875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.64, "grad_norm": 0.0007595557253807783, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 167075853.0, "reward": 1.119231104850769, "reward_std": 0.10055085271596909, "rewards/accuracy_reward": 0.4638671875, "rewards/brier_reward": 0.7750825881958008, "rewards/confidence_one_or_zero": 0.02685546875, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.40596190094947815, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 166.87060546875, "completions/mean_terminated_length": 166.87060546875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.6528, "grad_norm": 0.0037527712993323803, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 170378204.0, "reward": 1.0836341381072998, "reward_std": 0.10692138969898224, "rewards/accuracy_reward": 0.39794921875, "rewards/brier_reward": 0.7717597484588623, "rewards/confidence_one_or_zero": 0.02587890625, "rewards/format_reward": 0.99755859375, "rewards/mean_confidence_reward": 0.4016748070716858, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 165.9951171875, "completions/mean_terminated_length": 165.9951171875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6656, "grad_norm": 0.0017999252304434776, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 173709242.0, "reward": 1.1069138050079346, "reward_std": 0.10350505262613297, "rewards/accuracy_reward": 0.4482421875, "rewards/brier_reward": 0.7655847072601318, "rewards/confidence_one_or_zero": 0.02392578125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4061328172683716, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 167.8916015625, "completions/mean_terminated_length": 167.8916015625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.6784, "grad_norm": 0.002350540366023779, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 177022084.0, "reward": 1.08831787109375, "reward_std": 0.10483807325363159, "rewards/accuracy_reward": 0.39990234375, "rewards/brier_reward": 0.7767325639724731, "rewards/confidence_one_or_zero": 0.02294921875, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4102539122104645, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 170.22705078125, "completions/mean_terminated_length": 170.22705078125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.6912, "grad_norm": 0.0024247784167528152, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 180378381.0, "reward": 1.0937700271606445, "reward_std": 0.11120226234197617, "rewards/accuracy_reward": 0.41552734375, "rewards/brier_reward": 0.7725003957748413, "rewards/confidence_one_or_zero": 0.01171875, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.41514649987220764, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 171.05126953125, "completions/mean_terminated_length": 171.05126953125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.704, "grad_norm": 0.0025645701680332422, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 183697854.0, "reward": 1.1110807657241821, "reward_std": 0.11133909225463867, "rewards/accuracy_reward": 0.45947265625, "rewards/brier_reward": 0.7631762027740479, "rewards/confidence_one_or_zero": 0.0224609375, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.42967772483825684, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 171.7705078125, "completions/mean_terminated_length": 171.7705078125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7168, "grad_norm": 0.0012854809174314141, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 187083480.0, "reward": 1.1149908304214478, "reward_std": 0.11459929496049881, "rewards/accuracy_reward": 0.4580078125, "rewards/brier_reward": 0.7744144797325134, "rewards/confidence_one_or_zero": 0.0146484375, "rewards/format_reward": 0.99755859375, "rewards/mean_confidence_reward": 0.4292285144329071, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 171.90771484375, "completions/mean_terminated_length": 171.90771484375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.7296, "grad_norm": 0.0018713475437834859, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 190385139.0, "reward": 1.1170523166656494, "reward_std": 0.11444520950317383, "rewards/accuracy_reward": 0.4619140625, "rewards/brier_reward": 0.772189736366272, "rewards/confidence_one_or_zero": 0.02001953125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4356445372104645, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 170.96044921875, "completions/mean_terminated_length": 170.96044921875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.7424, "grad_norm": 0.0026220239233225584, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 193783130.0, "reward": 1.1173450946807861, "reward_std": 0.11841192841529846, "rewards/accuracy_reward": 0.46337890625, "rewards/brier_reward": 0.7713105082511902, "rewards/confidence_one_or_zero": 0.01416015625, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4385010004043579, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 173.064453125, "completions/mean_terminated_length": 173.064453125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.7552, "grad_norm": 0.0012924442999064922, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 197140494.0, "reward": 1.105983018875122, "reward_std": 0.107911616563797, "rewards/accuracy_reward": 0.43505859375, "rewards/brier_reward": 0.7773948907852173, "rewards/confidence_one_or_zero": 0.015625, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.44759440422058105, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 176.484375, "completions/mean_terminated_length": 176.07034301757812, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.768, "grad_norm": 0.0028388705104589462, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 200495046.0, "reward": 1.110207438468933, "reward_std": 0.11490217596292496, "rewards/accuracy_reward": 0.44677734375, "rewards/brier_reward": 0.7746132612228394, "rewards/confidence_one_or_zero": 0.01171875, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.45084962248802185, "step": 60 }, { "epoch": 0.768, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 340.5, "eval_completions/max_terminated_length": 340.5, "eval_completions/mean_length": 177.17342376708984, "eval_completions/mean_terminated_length": 177.17342376708984, "eval_completions/min_length": 83.5, "eval_completions/min_terminated_length": 83.5, "eval_loss": 0.0, "eval_num_tokens": 200495046.0, "eval_reward": 1.060145229101181, "eval_reward_std": 0.23021632805466652, "eval_rewards/accuracy_reward": 0.3515625, "eval_rewards/brier_reward": 0.7706802636384964, "eval_rewards/confidence_one_or_zero": 0.013671875, "eval_rewards/format_reward": 0.998046875, "eval_rewards/mean_confidence_reward": 0.44810547679662704, "eval_runtime": 35.5968, "eval_samples_per_second": 14.046, "eval_steps_per_second": 0.112, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 175.306640625, "completions/mean_terminated_length": 175.306640625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7808, "grad_norm": 0.0019142339006066322, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 203882754.0, "reward": 1.1105574369430542, "reward_std": 0.1143236756324768, "rewards/accuracy_reward": 0.44970703125, "rewards/brier_reward": 0.771406888961792, "rewards/confidence_one_or_zero": 0.01416015625, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.44986817240715027, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 175.62744140625, "completions/mean_terminated_length": 175.62744140625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.7936, "grad_norm": 0.0028708542231470346, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 207246823.0, "reward": 1.115567922592163, "reward_std": 0.11952622979879379, "rewards/accuracy_reward": 0.4619140625, "rewards/brier_reward": 0.770685613155365, "rewards/confidence_one_or_zero": 0.01708984375, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.4633447527885437, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 171.20751953125, "completions/mean_terminated_length": 171.20751953125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8064, "grad_norm": 0.013748224824666977, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 210614152.0, "reward": 1.1377573013305664, "reward_std": 0.10060383379459381, "rewards/accuracy_reward": 0.50634765625, "rewards/brier_reward": 0.769166111946106, "rewards/confidence_one_or_zero": 0.01708984375, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.47361326217651367, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 173.845703125, "completions/mean_terminated_length": 173.845703125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.8192, "grad_norm": 0.0016836629947647452, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 213986532.0, "reward": 1.1204241514205933, "reward_std": 0.12249487638473511, "rewards/accuracy_reward": 0.466796875, "rewards/brier_reward": 0.7740505933761597, "rewards/confidence_one_or_zero": 0.01806640625, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4648388624191284, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 175.62158203125, "completions/mean_terminated_length": 175.62158203125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.832, "grad_norm": 0.008333772420883179, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 217353213.0, "reward": 1.1017959117889404, "reward_std": 0.11205719411373138, "rewards/accuracy_reward": 0.43310546875, "rewards/brier_reward": 0.7714620232582092, "rewards/confidence_one_or_zero": 0.0166015625, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.46193361282348633, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 174.36376953125, "completions/mean_terminated_length": 174.36376953125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8448, "grad_norm": 0.0011087502352893353, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 220702790.0, "reward": 1.1077888011932373, "reward_std": 0.11732746660709381, "rewards/accuracy_reward": 0.443359375, "rewards/brier_reward": 0.773193895816803, "rewards/confidence_one_or_zero": 0.0146484375, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.4771631062030792, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 180.34814453125, "completions/mean_terminated_length": 180.34814453125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8576, "grad_norm": 0.0014937082305550575, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 224068031.0, "reward": 1.111690640449524, "reward_std": 0.10702188313007355, "rewards/accuracy_reward": 0.44921875, "rewards/brier_reward": 0.774161696434021, "rewards/confidence_one_or_zero": 0.0107421875, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.47103172540664673, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 179.4482421875, "completions/mean_terminated_length": 179.4482421875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.8704, "grad_norm": 0.003253382397815585, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 227500213.0, "reward": 1.1289840936660767, "reward_std": 0.11423557996749878, "rewards/accuracy_reward": 0.4765625, "rewards/brier_reward": 0.7823812365531921, "rewards/confidence_one_or_zero": 0.01513671875, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.4756738245487213, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 177.3544921875, "completions/mean_terminated_length": 177.3544921875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8832, "grad_norm": 0.000763367279432714, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 230841907.0, "reward": 1.0981295108795166, "reward_std": 0.11358843743801117, "rewards/accuracy_reward": 0.41943359375, "rewards/brier_reward": 0.7768245935440063, "rewards/confidence_one_or_zero": 0.015625, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4759277403354645, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 178.068359375, "completions/mean_terminated_length": 177.6551055908203, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.896, "grad_norm": 0.000891282397788018, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 234239855.0, "reward": 1.1204962730407715, "reward_std": 0.11121779680252075, "rewards/accuracy_reward": 0.4697265625, "rewards/brier_reward": 0.7717532515525818, "rewards/confidence_one_or_zero": 0.01318359375, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.48045408725738525, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 181.03125, "completions/mean_terminated_length": 181.03125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.9088, "grad_norm": 0.0010501582873985171, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 237612871.0, "reward": 1.1125749349594116, "reward_std": 0.11690981686115265, "rewards/accuracy_reward": 0.45654296875, "rewards/brier_reward": 0.7695823907852173, "rewards/confidence_one_or_zero": 0.01025390625, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.47194337844848633, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 179.0400390625, "completions/mean_terminated_length": 179.0400390625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9216, "grad_norm": 0.0008443333790637553, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 241019905.0, "reward": 1.1193747520446777, "reward_std": 0.11691069602966309, "rewards/accuracy_reward": 0.470703125, "rewards/brier_reward": 0.7690220475196838, "rewards/confidence_one_or_zero": 0.02294921875, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.46311524510383606, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 177.8017578125, "completions/mean_terminated_length": 177.8017578125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.9344, "grad_norm": 0.000890648050699383, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 244393011.0, "reward": 1.0996851921081543, "reward_std": 0.1123179942369461, "rewards/accuracy_reward": 0.41845703125, "rewards/brier_reward": 0.7809123992919922, "rewards/confidence_one_or_zero": 0.02197265625, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4633203148841858, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 179.6220703125, "completions/mean_terminated_length": 179.6220703125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9472, "grad_norm": 0.000775814289227128, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 247706293.0, "reward": 1.115156650543213, "reward_std": 0.10829775035381317, "rewards/accuracy_reward": 0.45654296875, "rewards/brier_reward": 0.7737695574760437, "rewards/confidence_one_or_zero": 0.0185546875, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4643896520137787, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 180.197265625, "completions/mean_terminated_length": 180.197265625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.96, "grad_norm": 0.0024549230001866817, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 251076793.0, "reward": 1.1133463382720947, "reward_std": 0.10567133873701096, "rewards/accuracy_reward": 0.4453125, "rewards/brier_reward": 0.781867504119873, "rewards/confidence_one_or_zero": 0.01123046875, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.481499046087265, "step": 75 }, { "epoch": 0.96, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 335.25, "eval_completions/max_terminated_length": 335.25, "eval_completions/mean_length": 180.3135108947754, "eval_completions/mean_terminated_length": 180.3135108947754, "eval_completions/min_length": 106.0, "eval_completions/min_terminated_length": 106.0, "eval_loss": 0.0, "eval_num_tokens": 251076793.0, "eval_reward": 1.0895682871341705, "eval_reward_std": 0.24639935791492462, "eval_rewards/accuracy_reward": 0.396484375, "eval_rewards/brier_reward": 0.7826513648033142, "eval_rewards/confidence_one_or_zero": 0.009765625, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.46923828125, "eval_runtime": 35.4002, "eval_samples_per_second": 14.124, "eval_steps_per_second": 0.113, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 182.37158203125, "completions/mean_terminated_length": 181.96043395996094, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9728, "grad_norm": 0.0009286075364798307, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 254430658.0, "reward": 1.1125905513763428, "reward_std": 0.11536484956741333, "rewards/accuracy_reward": 0.45068359375, "rewards/brier_reward": 0.7749849557876587, "rewards/confidence_one_or_zero": 0.015625, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.4785693287849426, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 181.07470703125, "completions/mean_terminated_length": 181.07470703125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.9856, "grad_norm": 0.0007356080459430814, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 257823147.0, "reward": 1.113877773284912, "reward_std": 0.1039476990699768, "rewards/accuracy_reward": 0.44091796875, "rewards/brier_reward": 0.7868366837501526, "rewards/confidence_one_or_zero": 0.02685546875, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.4647803008556366, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006377551020407823, "completions/max_length": 1024.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 179.0899200439453, "completions/mean_terminated_length": 178.5507354736328, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9984, "grad_norm": 0.008926053531467915, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 261189064.0, "reward": 1.1190553903579712, "reward_std": 0.1117812842130661, "rewards/accuracy_reward": 0.466796875, "rewards/brier_reward": 0.772289514541626, "rewards/confidence_one_or_zero": 0.01611328125, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.4710400402545929, "step": 78 }, { "epoch": 0.9984, "step": 78, "total_flos": 0.0, "train_loss": 0.006661382568404989, "train_runtime": 48105.2196, "train_samples_per_second": 0.416, "train_steps_per_second": 0.002 } ], "logging_steps": 1, "max_steps": 78, "num_input_tokens_seen": 261189064, "num_train_epochs": 1, "save_steps": 30, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }