[ { "loss": 0.0003, "grad_norm": 2.0297584533691406, "learning_rate": 0.0, "num_tokens": 2438.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.014825001358985901, "rewards/env_reward/std": 0.5229976177215576, "rewards/belief_accuracy/mean": 0.16745585203170776, "rewards/belief_accuracy/std": 0.03546026349067688, "reward": 0.5746050477027893, "reward_std": 0.7637181282043457, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.007555788848549128, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008333333333333334, "step": 1 }, { "loss": 0.0147, "grad_norm": 487.28997802734375, "learning_rate": 1.25e-06, "num_tokens": 4754.0, "completions/mean_length": 15.0, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 9.333333969116211, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3118250072002411, "rewards/env_reward/std": 0.26973089575767517, "rewards/belief_accuracy/mean": 0.37001365423202515, "rewards/belief_accuracy/std": 0.06095712259411812, "reward": 1.6277785301208496, "reward_std": 0.3852787911891937, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.36721258889883757, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0016666666666666668, "step": 2 }, { "loss": 0.0006, "grad_norm": 11.092467308044434, "learning_rate": 2.5e-06, "num_tokens": 7195.0, "completions/mean_length": 10.25, "completions/min_length": 9.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.6929999589920044, "rewards/env_reward/std": 1.5391119718551636, "rewards/belief_accuracy/mean": 0.03502512723207474, "rewards/belief_accuracy/std": 0.24171829223632812, "reward": -0.9344245195388794, "reward_std": 2.935027599334717, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.014069308177568018, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0025, "step": 3 }, { "loss": 0.0067, "grad_norm": 70.31130981445312, "learning_rate": 3.75e-06, "num_tokens": 9641.0, "completions/mean_length": 11.5, "completions/min_length": 9.0, "completions/max_length": 18.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 18.0, "rewards/format_valid/mean": 0.875, "rewards/format_valid/std": 0.25, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16352500021457672, "rewards/env_reward/std": 0.12513433396816254, "rewards/belief_accuracy/mean": 0.04012066870927811, "rewards/belief_accuracy/std": 0.20485758781433105, "reward": 0.40939950942993164, "reward_std": 0.6117480397224426, "frac_reward_zero_std": 0.0, "completion_length": 18.0, "kl": 0.16654532926622778, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0033333333333333335, "step": 4 }, { "loss": 0.0002, "grad_norm": 5.75938606262207, "learning_rate": 5e-06, "num_tokens": 12081.0, "completions/mean_length": 10.0, "completions/min_length": 9.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3447999954223633, "rewards/env_reward/std": 0.1597922146320343, "rewards/belief_accuracy/mean": 0.2165767103433609, "rewards/belief_accuracy/std": 0.21178469061851501, "reward": 1.2169301509857178, "reward_std": 0.5240868330001831, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.005745900256442837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004166666666666667, "step": 5 }, { "loss": 0.0001, "grad_norm": 6.055194854736328, "learning_rate": 6.25e-06, "num_tokens": 14378.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20652499794960022, "rewards/env_reward/std": 0.31724998354911804, "rewards/belief_accuracy/mean": 0.17082196474075317, "rewards/belief_accuracy/std": 0.06307334452867508, "reward": 0.8722533583641052, "reward_std": 0.5229519605636597, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.0013933435111539438, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005, "step": 6 }, { "loss": 0.0002, "grad_norm": 4.596080780029297, "learning_rate": 7.5e-06, "num_tokens": 16817.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.06319999694824219, "rewards/env_reward/std": 0.1783815622329712, "rewards/belief_accuracy/mean": 0.07134255766868591, "rewards/belief_accuracy/std": 0.08802108466625214, "reward": 0.3588276505470276, "reward_std": 0.5061821937561035, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.003983344242442399, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005833333333333334, "step": 7 }, { "loss": 0.0002, "grad_norm": 5.14266300201416, "learning_rate": 8.75e-06, "num_tokens": 19254.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.08379998803138733, "rewards/env_reward/std": 0.4169001281261444, "rewards/belief_accuracy/mean": 0.25081950426101685, "rewards/belief_accuracy/std": 0.09036833792924881, "reward": 0.9281585216522217, "reward_std": 0.641987144947052, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.0045957728871144354, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.006666666666666667, "step": 8 }, { "loss": 0.0002, "grad_norm": 5.244797229766846, "learning_rate": 1e-05, "num_tokens": 21694.0, "completions/mean_length": 10.0, "completions/min_length": 9.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.7247750163078308, "rewards/env_reward/std": 1.522688627243042, "rewards/belief_accuracy/mean": 0.08231388032436371, "rewards/belief_accuracy/std": 0.22892379760742188, "reward": -0.8402208089828491, "reward_std": 2.993889331817627, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.004008802643511444, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0075, "step": 9 }, { "loss": 0.0011, "grad_norm": 12.427257537841797, "learning_rate": 1.125e-05, "num_tokens": 24151.0, "completions/mean_length": 14.25, "completions/min_length": 10.0, "completions/max_length": 25.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 25.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1392499953508377, "rewards/env_reward/std": 0.016723934561014175, "rewards/belief_accuracy/mean": 0.1579858809709549, "rewards/belief_accuracy/std": 0.0883205309510231, "reward": 0.732832670211792, "reward_std": 0.2891167104244232, "frac_reward_zero_std": 0.0, "completion_length": 25.0, "kl": 0.028211204218678176, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.008333333333333333, "step": 10 }, { "loss": 0.0002, "grad_norm": 3.1559700965881348, "learning_rate": 1.25e-05, "num_tokens": 25878.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.5226500034332275, "rewards/env_reward/std": 1.6547037363052368, "rewards/belief_accuracy/mean": 0.16303499042987823, "rewards/belief_accuracy/std": 0.24976226687431335, "reward": -0.2948700189590454, "reward_std": 3.3204774856567383, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.005673486019077245, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.009166666666666667, "step": 11 }, { "loss": 0.0002, "grad_norm": 3.764303207397461, "learning_rate": 1.3750000000000002e-05, "num_tokens": 27605.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.023450002074241638, "rewards/env_reward/std": 0.07271481305360794, "rewards/belief_accuracy/mean": 0.22252154350280762, "rewards/belief_accuracy/std": 0.036789700388908386, "reward": 0.6823896169662476, "reward_std": 0.017910229042172432, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.004842391535930801, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01, "step": 12 }, { "loss": 0.0002, "grad_norm": 5.114142894744873, "learning_rate": 1.5e-05, "num_tokens": 30061.0, "completions/mean_length": 14.0, "completions/min_length": 10.0, "completions/max_length": 25.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 25.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.057100001722574234, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.23391437530517578, "rewards/belief_accuracy/std": 0.1463327705860138, "reward": 0.8373932242393494, "reward_std": 0.4389983117580414, "frac_reward_zero_std": 0.0, "completion_length": 25.0, "kl": 0.004877378873061389, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.010833333333333334, "step": 13 }, { "loss": 0.0001, "grad_norm": 3.0929784774780273, "learning_rate": 1.6250000000000002e-05, "num_tokens": 32501.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.29087501764297485, "rewards/env_reward/std": 0.7713783979415894, "rewards/belief_accuracy/mean": 0.19772300124168396, "rewards/belief_accuracy/std": 0.1775362491607666, "reward": 0.20685642957687378, "reward_std": 0.6949325203895569, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.0014771804580959724, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.011666666666666667, "step": 14 }, { "loss": 0.0003, "grad_norm": 6.996659755706787, "learning_rate": 1.75e-05, "num_tokens": 34939.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.25892502069473267, "rewards/env_reward/std": 0.1236146092414856, "rewards/belief_accuracy/mean": 0.18266388773918152, "rewards/belief_accuracy/std": 0.0645347312092781, "reward": 0.9863791465759277, "reward_std": 0.1628992110490799, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.007067542013828643, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0125, "step": 15 }, { "loss": 0.0002, "grad_norm": 5.10272216796875, "learning_rate": 1.8750000000000002e-05, "num_tokens": 37377.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2574499845504761, "rewards/env_reward/std": 0.2165336161851883, "rewards/belief_accuracy/mean": 0.16352561116218567, "rewards/belief_accuracy/std": 0.09197874367237091, "reward": 0.9267517924308777, "reward_std": 0.1699640452861786, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.004334542165452149, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.013333333333333334, "step": 16 }, { "loss": 0.0002, "grad_norm": 4.821376800537109, "learning_rate": 2e-05, "num_tokens": 39815.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.9602500200271606, "rewards/env_reward/std": 1.3646483421325684, "rewards/belief_accuracy/mean": 0.08026264607906342, "rewards/belief_accuracy/std": 0.1924673169851303, "reward": -1.199587106704712, "reward_std": 2.7017223834991455, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.004073493357282132, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.014166666666666666, "step": 17 }, { "loss": 0.0002, "grad_norm": 4.039498805999756, "learning_rate": 2.125e-05, "num_tokens": 42253.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3628250062465668, "rewards/env_reward/std": 0.42794182896614075, "rewards/belief_accuracy/mean": 0.1981642246246338, "rewards/belief_accuracy/std": 0.08274012058973312, "reward": 1.1887301206588745, "reward_std": 0.5090645551681519, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.003853246627841145, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.015, "step": 18 }, { "loss": 0.0001, "grad_norm": 8.75765609741211, "learning_rate": 2.25e-05, "num_tokens": 44693.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5987499952316284, "rewards/env_reward/std": 0.9246196746826172, "rewards/belief_accuracy/mean": 0.1160651445388794, "rewards/belief_accuracy/std": 0.10867781937122345, "reward": 1.2963204383850098, "reward_std": 1.6085225343704224, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.002594503777800128, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.015833333333333335, "step": 19 }, { "loss": 0.0001, "grad_norm": 3.9152791500091553, "learning_rate": 2.375e-05, "num_tokens": 46624.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.18017500638961792, "rewards/env_reward/std": 0.15538449585437775, "rewards/belief_accuracy/mean": 0.09383687376976013, "rewards/belief_accuracy/std": 0.03703703731298447, "reward": 0.6017731428146362, "reward_std": 0.15383398532867432, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.0029762745389234624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.016666666666666666, "step": 20 }, { "loss": 0.0002, "grad_norm": 4.0644450187683105, "learning_rate": 2.5e-05, "num_tokens": 49066.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.5944499969482422, "rewards/env_reward/std": 1.6077580451965332, "rewards/belief_accuracy/mean": 0.122025266289711, "rewards/belief_accuracy/std": 0.2217501848936081, "reward": -0.5255992412567139, "reward_std": 3.1553311347961426, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.00437403135583736, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0175, "step": 21 }, { "loss": 0.0001, "grad_norm": 3.2595951557159424, "learning_rate": 2.625e-05, "num_tokens": 51505.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.08552499860525131, "rewards/env_reward/std": 0.2797339856624603, "rewards/belief_accuracy/mean": 0.17279182374477386, "rewards/belief_accuracy/std": 0.07431290298700333, "reward": 0.4400880038738251, "reward_std": 0.5773349404335022, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.0019797176646534353, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.018333333333333333, "step": 22 }, { "loss": 0.1212, "grad_norm": 736.62890625, "learning_rate": 2.7500000000000004e-05, "num_tokens": 53966.0, "completions/mean_length": 15.25, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 9.666666984558105, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1115499958395958, "rewards/env_reward/std": 0.04209999740123749, "rewards/belief_accuracy/mean": 0.2777556777000427, "rewards/belief_accuracy/std": 0.10405799001455307, "reward": 1.050592064857483, "reward_std": 0.2744295299053192, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 3.031015553511679, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.019166666666666665, "step": 23 }, { "loss": 0.0007, "grad_norm": 4.14186954498291, "learning_rate": 2.8749999999999997e-05, "num_tokens": 56407.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.4265500009059906, "rewards/env_reward/std": 0.7103733420372009, "rewards/belief_accuracy/mean": 0.16075819730758667, "rewards/belief_accuracy/std": 0.09638070315122604, "reward": -0.10755039006471634, "reward_std": 1.0050357580184937, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.01767307601403445, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02, "step": 24 }, { "loss": 0.0002, "grad_norm": 4.760839462280273, "learning_rate": 3e-05, "num_tokens": 58846.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.5765249729156494, "rewards/env_reward/std": 1.6289700269699097, "rewards/belief_accuracy/mean": 0.08613643050193787, "rewards/belief_accuracy/std": 0.20087628066539764, "reward": -0.6063781976699829, "reward_std": 3.0987460613250732, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.004387545719509944, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.020833333333333332, "step": 25 }, { "loss": 0.0003, "grad_norm": 4.131656646728516, "learning_rate": 3.125e-05, "num_tokens": 61285.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5048750042915344, "rewards/env_reward/std": 0.30701497197151184, "rewards/belief_accuracy/mean": 0.22258360683918, "rewards/belief_accuracy/std": 0.0710342675447464, "reward": 1.4750633239746094, "reward_std": 0.6433582305908203, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.006817424291511998, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.021666666666666667, "step": 26 }, { "loss": 0.0005, "grad_norm": 4.655338764190674, "learning_rate": 3.2500000000000004e-05, "num_tokens": 63724.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.47574999928474426, "rewards/env_reward/std": 0.4241226613521576, "rewards/belief_accuracy/mean": 0.21906425058841705, "rewards/belief_accuracy/std": 0.18924623727798462, "reward": 1.4208177328109741, "reward_std": 1.19350266456604, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.011261592793744057, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0225, "step": 27 }, { "loss": 0.1623, "grad_norm": 35.8610725402832, "learning_rate": 3.375000000000001e-05, "num_tokens": 65857.0, "completions/mean_length": 15.25, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 9.666666984558105, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.31607499718666077, "rewards/env_reward/std": 0.7444266080856323, "rewards/belief_accuracy/mean": 0.22687393426895142, "rewards/belief_accuracy/std": 0.07754969596862793, "reward": 0.2565092444419861, "reward_std": 0.9514979124069214, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 4.056387651711702, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.023333333333333334, "step": 28 }, { "loss": 0.0027, "grad_norm": 3.129375696182251, "learning_rate": 3.5e-05, "num_tokens": 68295.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5361000299453735, "rewards/env_reward/std": 0.2634325325489044, "rewards/belief_accuracy/mean": 0.23782525956630707, "rewards/belief_accuracy/std": 0.16049730777740479, "reward": 1.5676257610321045, "reward_std": 0.5401614904403687, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.06665351914125495, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.024166666666666666, "step": 29 }, { "loss": 0.0038, "grad_norm": 7.313047885894775, "learning_rate": 3.625e-05, "num_tokens": 70592.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3769500255584717, "rewards/env_reward/std": 0.1939132809638977, "rewards/belief_accuracy/mean": 0.25660422444343567, "rewards/belief_accuracy/std": 0.030240608379244804, "reward": 1.385237693786621, "reward_std": 0.23877514898777008, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.09575654342916096, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.025, "step": 30 }, { "loss": 0.007, "grad_norm": 302.0821838378906, "learning_rate": 3.7500000000000003e-05, "num_tokens": 73036.0, "completions/mean_length": 11.0, "completions/min_length": 9.0, "completions/max_length": 15.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 15.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.7235249876976013, "rewards/env_reward/std": 1.5176500082015991, "rewards/belief_accuracy/mean": 0.08568151295185089, "rewards/belief_accuracy/std": 0.19252440333366394, "reward": -0.8282430171966553, "reward_std": 2.9490480422973633, "frac_reward_zero_std": 0.0, "completion_length": 15.0, "kl": 0.17512649775017053, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.025833333333333333, "step": 31 }, { "loss": 0.3763, "grad_norm": 78.20838165283203, "learning_rate": 3.875e-05, "num_tokens": 75498.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.8036249876022339, "rewards/env_reward/std": 1.5019314289093018, "rewards/belief_accuracy/mean": -0.07945596426725388, "rewards/belief_accuracy/std": 0.0959252119064331, "reward": -1.4438053369522095, "reward_std": 2.5607211589813232, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 9.407063644379377, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02666666666666667, "step": 32 }, { "loss": 0.0028, "grad_norm": 3.190960168838501, "learning_rate": 4e-05, "num_tokens": 77225.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5138000249862671, "rewards/env_reward/std": 0.23183931410312653, "rewards/belief_accuracy/mean": 0.25149983167648315, "rewards/belief_accuracy/std": 0.07094359397888184, "reward": 1.5751994848251343, "reward_std": 0.5590165257453918, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.06879310176009312, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0275, "step": 33 }, { "loss": 0.0168, "grad_norm": 28.640579223632812, "learning_rate": 4.125e-05, "num_tokens": 79700.0, "completions/mean_length": 18.75, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 14.333333969116211, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 24.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.2737500071525574, "rewards/env_reward/std": 1.855374813079834, "rewards/belief_accuracy/mean": 0.2497768998146057, "rewards/belief_accuracy/std": 0.30190309882164, "reward": 0.3387056589126587, "reward_std": 3.7822914123535156, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.42012222670018673, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.028333333333333332, "step": 34 }, { "loss": 0.006, "grad_norm": 5.623953342437744, "learning_rate": 4.25e-05, "num_tokens": 82138.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.23285000026226044, "rewards/env_reward/std": 0.23289161920547485, "rewards/belief_accuracy/mean": 0.25250691175460815, "rewards/belief_accuracy/std": 0.1582489311695099, "reward": 0.45824581384658813, "reward_std": 0.7270535230636597, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.14891700155567378, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.029166666666666667, "step": 35 }, { "loss": 0.0102, "grad_norm": 5.508930683135986, "learning_rate": 4.375e-05, "num_tokens": 84578.0, "completions/mean_length": 10.0, "completions/min_length": 9.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.22097501158714294, "rewards/env_reward/std": 0.6560747623443604, "rewards/belief_accuracy/mean": 0.24804989993572235, "rewards/belief_accuracy/std": 0.12259609997272491, "reward": 0.4626871347427368, "reward_std": 0.8850942254066467, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.2548879988025874, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03, "step": 36 }, { "loss": 0.0053, "grad_norm": 5.627114295959473, "learning_rate": 4.5e-05, "num_tokens": 87019.0, "completions/mean_length": 10.25, "completions/min_length": 9.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": -0.5, "rewards/format_valid/std": 1.7320507764816284, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/env_reward/mean": -1.3700499534606934, "rewards/env_reward/std": 1.8821041584014893, "rewards/belief_accuracy/mean": -0.0548660047352314, "rewards/belief_accuracy/std": 0.19056659936904907, "reward": -2.2696728706359863, "reward_std": 3.4521307945251465, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.132804719673004, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.030833333333333334, "step": 37 }, { "loss": 0.0092, "grad_norm": 5.246164321899414, "learning_rate": 4.6250000000000006e-05, "num_tokens": 89457.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1657249927520752, "rewards/env_reward/std": 0.03991043195128441, "rewards/belief_accuracy/mean": 0.29197028279304504, "rewards/belief_accuracy/std": 0.12365185469388962, "reward": 1.1744983196258545, "reward_std": 0.3355043828487396, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2298137085745111, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03166666666666667, "step": 38 }, { "loss": 0.097, "grad_norm": 12.651698112487793, "learning_rate": 4.75e-05, "num_tokens": 91918.0, "completions/mean_length": 15.25, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 9.666666984558105, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.8153500556945801, "rewards/env_reward/std": 0.9287000298500061, "rewards/belief_accuracy/mean": 0.13362497091293335, "rewards/belief_accuracy/std": 0.1170458197593689, "reward": -0.7721501588821411, "reward_std": 1.5534520149230957, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 2.4256066400557756, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0325, "step": 39 }, { "loss": 0.0051, "grad_norm": 7.508624076843262, "learning_rate": 4.875e-05, "num_tokens": 93644.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4228000044822693, "rewards/env_reward/std": 0.6864694952964783, "rewards/belief_accuracy/mean": 0.3421034812927246, "rewards/belief_accuracy/std": 0.03489655256271362, "reward": 1.710510492324829, "reward_std": 0.9325205683708191, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.12869007809786126, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03333333333333333, "step": 40 }, { "loss": 0.0096, "grad_norm": 4.437275409698486, "learning_rate": 5e-05, "num_tokens": 96082.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.345674991607666, "rewards/env_reward/std": 0.2988019585609436, "rewards/belief_accuracy/mean": 0.1830797791481018, "rewards/belief_accuracy/std": 0.0974152609705925, "reward": 1.1177518367767334, "reward_std": 0.4233185946941376, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.23907954257447273, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.034166666666666665, "step": 41 }, { "loss": 0.0068, "grad_norm": 4.4014458656311035, "learning_rate": 4.986111111111111e-05, "num_tokens": 98197.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.16957500576972961, "rewards/env_reward/std": 0.7329205870628357, "rewards/belief_accuracy/mean": 0.05849887803196907, "rewards/belief_accuracy/std": 0.0555555559694767, "reward": -0.028865892440080643, "reward_std": 1.0444624423980713, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.1710746451281011, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.035, "step": 42 }, { "loss": 0.0018, "grad_norm": 3.1398186683654785, "learning_rate": 4.972222222222223e-05, "num_tokens": 100633.0, "completions/mean_length": 9.0, "completions/min_length": 9.0, "completions/max_length": 9.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 9.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.3562999963760376, "rewards/env_reward/std": 0.1712000072002411, "rewards/belief_accuracy/mean": 0.18461482226848602, "rewards/belief_accuracy/std": 0.1464960277080536, "reward": 0.06939443945884705, "reward_std": 0.36057770252227783, "frac_reward_zero_std": 0.0, "completion_length": 9.0, "kl": 0.044276596046984196, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.035833333333333335, "step": 43 }, { "loss": 0.0148, "grad_norm": 4.34901237487793, "learning_rate": 4.958333333333334e-05, "num_tokens": 103070.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.06962499022483826, "rewards/env_reward/std": 0.2918499708175659, "rewards/belief_accuracy/mean": 0.28115594387054443, "rewards/belief_accuracy/std": 0.12468524277210236, "reward": 0.9979052543640137, "reward_std": 0.08536310493946075, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3707070527598262, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03666666666666667, "step": 44 }, { "loss": 0.011, "grad_norm": 2.1270384788513184, "learning_rate": 4.9444444444444446e-05, "num_tokens": 105367.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.34357500076293945, "rewards/env_reward/std": 0.07905000448226929, "rewards/belief_accuracy/mean": 0.2644440233707428, "rewards/belief_accuracy/std": 0.05163230374455452, "reward": 1.3586945533752441, "reward_std": 0.09579288214445114, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.27486721728928387, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0375, "step": 45 }, { "loss": 0.014, "grad_norm": 11.598420143127441, "learning_rate": 4.930555555555556e-05, "num_tokens": 107809.0, "completions/mean_length": 10.5, "completions/min_length": 9.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.5223749876022339, "rewards/env_reward/std": 1.6578657627105713, "rewards/belief_accuracy/mean": 0.06579001247882843, "rewards/belief_accuracy/std": 0.18941286206245422, "reward": -0.5861924886703491, "reward_std": 3.1335580348968506, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.35064559197053313, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03833333333333333, "step": 46 }, { "loss": 0.0073, "grad_norm": 3.6913492679595947, "learning_rate": 4.9166666666666665e-05, "num_tokens": 110248.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3232000172138214, "rewards/env_reward/std": 0.1860000044107437, "rewards/belief_accuracy/mean": 0.302872896194458, "rewards/belief_accuracy/std": 0.07409624010324478, "reward": 1.4434185028076172, "reward_std": 0.21208810806274414, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.18329114187508821, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03916666666666667, "step": 47 }, { "loss": 0.0267, "grad_norm": 7.727506160736084, "learning_rate": 4.902777777777778e-05, "num_tokens": 112690.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4395500123500824, "rewards/env_reward/std": 0.4166736900806427, "rewards/belief_accuracy/mean": 0.23232725262641907, "rewards/belief_accuracy/std": 0.16305728256702423, "reward": 1.4063067436218262, "reward_std": 0.5394194722175598, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.6681761667132378, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04, "step": 48 }, { "loss": 0.013, "grad_norm": 4.620209217071533, "learning_rate": 4.888888888888889e-05, "num_tokens": 115127.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.21597501635551453, "rewards/env_reward/std": 0.0036500021815299988, "rewards/belief_accuracy/mean": 0.2432861626148224, "rewards/belief_accuracy/std": 0.13326476514339447, "reward": 0.45589596033096313, "reward_std": 0.40303459763526917, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.32542235124856234, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04083333333333333, "step": 49 }, { "loss": 0.071, "grad_norm": 15.720799446105957, "learning_rate": 4.875e-05, "num_tokens": 117585.0, "completions/mean_length": 14.5, "completions/min_length": 9.0, "completions/max_length": 28.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 28.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.743025004863739, "rewards/env_reward/std": 1.5051672458648682, "rewards/belief_accuracy/mean": 0.14229519665241241, "rewards/belief_accuracy/std": 0.23121023178100586, "reward": -0.6876518726348877, "reward_std": 3.0439605712890625, "frac_reward_zero_std": 0.0, "completion_length": 28.0, "kl": 1.7742180861532688, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.041666666666666664, "step": 50 }, { "loss": 0.007, "grad_norm": 5.957500457763672, "learning_rate": 4.8611111111111115e-05, "num_tokens": 119516.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6230250000953674, "rewards/env_reward/std": 0.24345001578330994, "rewards/belief_accuracy/mean": 0.3019692301750183, "rewards/belief_accuracy/std": 0.07230889797210693, "reward": 1.8904452323913574, "reward_std": 0.34035390615463257, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.17545690201222897, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0425, "step": 51 }, { "loss": 0.0138, "grad_norm": 5.6418776512146, "learning_rate": 4.8472222222222224e-05, "num_tokens": 121954.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.20785000920295715, "rewards/env_reward/std": 0.14000745117664337, "rewards/belief_accuracy/mean": 0.24578773975372314, "rewards/belief_accuracy/std": 0.12300436198711395, "reward": 0.47558823227882385, "reward_std": 0.17540638148784637, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3459396343678236, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.043333333333333335, "step": 52 }, { "loss": 0.0132, "grad_norm": 6.052071571350098, "learning_rate": 4.8333333333333334e-05, "num_tokens": 124393.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.027125000953674316, "rewards/env_reward/std": 0.1343500018119812, "rewards/belief_accuracy/mean": 0.34039586782455444, "rewards/belief_accuracy/std": 0.07877691090106964, "reward": 1.111875057220459, "reward_std": 0.417526513338089, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3293638424947858, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04416666666666667, "step": 53 }, { "loss": 0.005, "grad_norm": 5.564804553985596, "learning_rate": 4.819444444444445e-05, "num_tokens": 126833.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.5870499610900879, "rewards/env_reward/std": 1.6088296175003052, "rewards/belief_accuracy/mean": 0.1694604456424713, "rewards/belief_accuracy/std": 0.25362879037857056, "reward": -0.37219369411468506, "reward_std": 3.258995294570923, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.12563097476959229, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.045, "step": 54 }, { "loss": 0.0198, "grad_norm": 35.32516860961914, "learning_rate": 4.805555555555556e-05, "num_tokens": 129266.0, "completions/mean_length": 8.25, "completions/min_length": 5.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 8.25, "completions/min_terminated_length": 5.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.875, "rewards/format_valid/std": 0.25, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.0349000059068203, "rewards/env_reward/std": 0.3493712842464447, "rewards/belief_accuracy/mean": 0.1392139196395874, "rewards/belief_accuracy/std": 0.24298755824565887, "reward": 0.5137417316436768, "reward_std": 1.1124905347824097, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4959399476647377, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04583333333333333, "step": 55 }, { "loss": 0.0235, "grad_norm": 3.5689568519592285, "learning_rate": 4.791666666666667e-05, "num_tokens": 131704.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.0639750063419342, "rewards/env_reward/std": 0.37699612975120544, "rewards/belief_accuracy/mean": 0.20113441348075867, "rewards/belief_accuracy/std": 0.06771832704544067, "reward": 0.5574407577514648, "reward_std": 0.4707197844982147, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5865896865725517, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04666666666666667, "step": 56 }, { "loss": 0.0152, "grad_norm": 4.44246768951416, "learning_rate": 4.7777777777777784e-05, "num_tokens": 134142.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.13779999315738678, "rewards/env_reward/std": 0.4736187756061554, "rewards/belief_accuracy/mean": 0.24740056693553925, "rewards/belief_accuracy/std": 0.19466152787208557, "reward": 0.5855016708374023, "reward_std": 0.5364238023757935, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3799317330121994, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0475, "step": 57 }, { "loss": 0.0146, "grad_norm": 2.86322021484375, "learning_rate": 4.7638888888888887e-05, "num_tokens": 136072.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2609499990940094, "rewards/env_reward/std": 0.06991129368543625, "rewards/belief_accuracy/mean": 0.2346869856119156, "rewards/belief_accuracy/std": 0.0894273892045021, "reward": 1.1454858779907227, "reward_std": 0.34598207473754883, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3641611896455288, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04833333333333333, "step": 58 }, { "loss": 0.1121, "grad_norm": 45.849605560302734, "learning_rate": 4.75e-05, "num_tokens": 138533.0, "completions/mean_length": 15.25, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 9.666666984558105, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4922500252723694, "rewards/env_reward/std": 0.1800999939441681, "rewards/belief_accuracy/mean": 0.32485654950141907, "rewards/belief_accuracy/std": 0.14582888782024384, "reward": 1.7629446983337402, "reward_std": 0.3849683701992035, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 2.8015664890408516, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.049166666666666664, "step": 59 }, { "loss": 0.0063, "grad_norm": 4.147298336029053, "learning_rate": 4.736111111111111e-05, "num_tokens": 140974.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.19417500495910645, "rewards/env_reward/std": 0.9323619604110718, "rewards/belief_accuracy/mean": 0.22329160571098328, "rewards/belief_accuracy/std": 0.2285340428352356, "reward": 1.0111374855041504, "reward_std": 1.9640129804611206, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.15783802792429924, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05, "step": 60 }, { "loss": 0.0268, "grad_norm": 5.107398509979248, "learning_rate": 4.722222222222222e-05, "num_tokens": 143417.0, "completions/mean_length": 10.75, "completions/min_length": 9.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.11292499303817749, "rewards/env_reward/std": 0.02424999698996544, "rewards/belief_accuracy/mean": 0.2869609594345093, "rewards/belief_accuracy/std": 0.03287558630108833, "reward": 0.7414954900741577, "reward_std": 0.12289471924304962, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.6699433820322156, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.050833333333333335, "step": 61 }, { "loss": 0.0086, "grad_norm": 1.9671472311019897, "learning_rate": 4.708333333333334e-05, "num_tokens": 145857.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.8258000016212463, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.1922764927148819, "rewards/belief_accuracy/std": 0.07349060475826263, "reward": 1.8655295372009277, "reward_std": 0.22047176957130432, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.21522311307489872, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.051666666666666666, "step": 62 }, { "loss": 0.3014, "grad_norm": 46.615821838378906, "learning_rate": 4.6944444444444446e-05, "num_tokens": 148317.0, "completions/mean_length": 15.0, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 9.333333969116211, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.38324999809265137, "rewards/env_reward/std": 1.7579199075698853, "rewards/belief_accuracy/mean": 0.16348761320114136, "rewards/belief_accuracy/std": 0.24438007175922394, "reward": -0.0844120979309082, "reward_std": 3.4652018547058105, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 7.5339599046856165, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0525, "step": 63 }, { "loss": 0.0125, "grad_norm": 3.313786506652832, "learning_rate": 4.6805555555555556e-05, "num_tokens": 150756.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.33502501249313354, "rewards/env_reward/std": 0.1946970373392105, "rewards/belief_accuracy/mean": 0.2965036630630493, "rewards/belief_accuracy/std": 0.015631647780537605, "reward": 1.4420485496520996, "reward_std": 0.30519527196884155, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.31290814094245434, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05333333333333334, "step": 64 }, { "loss": 0.0111, "grad_norm": 4.699985980987549, "learning_rate": 4.666666666666667e-05, "num_tokens": 153195.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2145249992609024, "rewards/env_reward/std": 0.4134778380393982, "rewards/belief_accuracy/mean": 0.09438945353031158, "rewards/belief_accuracy/std": 0.07635381072759628, "reward": 0.6549558639526367, "reward_std": 0.4356115162372589, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.27691997960209846, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05416666666666667, "step": 65 }, { "loss": 0.0094, "grad_norm": 2.48840594291687, "learning_rate": 4.652777777777778e-05, "num_tokens": 155634.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.17452499270439148, "rewards/env_reward/std": 0.19420935213565826, "rewards/belief_accuracy/mean": 0.2777777910232544, "rewards/belief_accuracy/std": 0.06283816695213318, "reward": 1.1451208591461182, "reward_std": 0.143072709441185, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.23591649159789085, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.055, "step": 66 }, { "loss": 0.0104, "grad_norm": 5.279496669769287, "learning_rate": 4.638888888888889e-05, "num_tokens": 158072.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.0007500015199184418, "rewards/env_reward/std": 0.0485551580786705, "rewards/belief_accuracy/mean": 0.15712279081344604, "rewards/belief_accuracy/std": 0.0946710854768753, "reward": 0.5202434062957764, "reward_std": 0.3491542339324951, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2593200672417879, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05583333333333333, "step": 67 }, { "loss": 0.0088, "grad_norm": 5.2196364402771, "learning_rate": 4.6250000000000006e-05, "num_tokens": 160511.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2296999990940094, "rewards/env_reward/std": 0.3610000014305115, "rewards/belief_accuracy/mean": 0.2660156190395355, "rewards/belief_accuracy/std": 0.03546026349067688, "reward": 1.1925969123840332, "reward_std": 0.4943404197692871, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.22027479112148285, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.056666666666666664, "step": 68 }, { "loss": 0.0126, "grad_norm": 2.0931131839752197, "learning_rate": 4.6111111111111115e-05, "num_tokens": 162950.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3617500066757202, "rewards/env_reward/std": 0.12350000441074371, "rewards/belief_accuracy/mean": 0.2787468433380127, "rewards/belief_accuracy/std": 0.07509276270866394, "reward": 1.4288655519485474, "reward_std": 0.21090544760227203, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.31468209251761436, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0575, "step": 69 }, { "loss": 0.0534, "grad_norm": 4.71396017074585, "learning_rate": 4.5972222222222225e-05, "num_tokens": 165395.0, "completions/mean_length": 11.25, "completions/min_length": 9.0, "completions/max_length": 17.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 17.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1973000019788742, "rewards/env_reward/std": 0.24144788086414337, "rewards/belief_accuracy/mean": 0.30136024951934814, "rewards/belief_accuracy/std": 0.09320782870054245, "reward": 1.2500306367874146, "reward_std": 0.5057908892631531, "frac_reward_zero_std": 0.0, "completion_length": 17.0, "kl": 1.3337756432592869, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.058333333333333334, "step": 70 }, { "loss": 0.0191, "grad_norm": 3.5546116828918457, "learning_rate": 4.5833333333333334e-05, "num_tokens": 167834.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.12457499653100967, "rewards/env_reward/std": 0.02005000039935112, "rewards/belief_accuracy/mean": 0.24463149905204773, "rewards/belief_accuracy/std": 0.12909035384655, "reward": 0.970757007598877, "reward_std": 0.37021511793136597, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4766240194439888, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.059166666666666666, "step": 71 }, { "loss": 0.0421, "grad_norm": 2.085524559020996, "learning_rate": 4.569444444444444e-05, "num_tokens": 170280.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4526999890804291, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.191875159740448, "rewards/belief_accuracy/std": 0.0556454099714756, "reward": 1.3046754598617554, "reward_std": 0.1669362485408783, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 1.051513284444809, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06, "step": 72 }, { "loss": 0.0475, "grad_norm": 17.69377326965332, "learning_rate": 4.555555555555556e-05, "num_tokens": 172602.0, "completions/mean_length": 15.5, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": -0.5, "rewards/format_valid/std": 1.7320507764816284, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/env_reward/mean": -1.5393500328063965, "rewards/env_reward/std": 1.6937329769134521, "rewards/belief_accuracy/mean": 0.07886260747909546, "rewards/belief_accuracy/std": 0.32305219769477844, "reward": -2.122437000274658, "reward_std": 3.624743938446045, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 1.1864948254078627, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.060833333333333336, "step": 73 }, { "loss": 0.0076, "grad_norm": 1.4644877910614014, "learning_rate": 4.541666666666667e-05, "num_tokens": 175042.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3749750256538391, "rewards/env_reward/std": 0.08854999393224716, "rewards/belief_accuracy/mean": 0.24918042123317719, "rewards/belief_accuracy/std": 0.07092051953077316, "reward": 1.3600037097930908, "reward_std": 0.1827382892370224, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.1901670265942812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06166666666666667, "step": 74 }, { "loss": 0.0251, "grad_norm": 6.083766937255859, "learning_rate": 4.527777777777778e-05, "num_tokens": 177484.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.7234249711036682, "rewards/env_reward/std": 1.5177685022354126, "rewards/belief_accuracy/mean": 0.25676751136779785, "rewards/belief_accuracy/std": 0.30544862151145935, "reward": -0.3148350715637207, "reward_std": 3.2907352447509766, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.6269553881138563, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0625, "step": 75 }, { "loss": 0.0561, "grad_norm": 2.858494997024536, "learning_rate": 4.5138888888888894e-05, "num_tokens": 179923.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2519000172615051, "rewards/env_reward/std": 0.2782000005245209, "rewards/belief_accuracy/mean": 0.1935710310935974, "rewards/belief_accuracy/std": 0.07880179584026337, "reward": 1.0085630416870117, "reward_std": 0.5302945375442505, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.4018143992871046, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06333333333333334, "step": 76 }, { "loss": 0.018, "grad_norm": 4.091435432434082, "learning_rate": 4.5e-05, "num_tokens": 182362.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.19592499732971191, "rewards/env_reward/std": 0.4110499918460846, "rewards/belief_accuracy/mean": 0.32113736867904663, "rewards/belief_accuracy/std": 0.06240236759185791, "reward": 0.719524621963501, "reward_std": 0.5247438549995422, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4507467746734619, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06416666666666666, "step": 77 }, { "loss": 0.0253, "grad_norm": 3.242192268371582, "learning_rate": 4.486111111111111e-05, "num_tokens": 184802.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.12035000324249268, "rewards/env_reward/std": 0.0738430991768837, "rewards/belief_accuracy/mean": 0.31365954875946045, "rewards/belief_accuracy/std": 0.16403073072433472, "reward": 1.1715035438537598, "reward_std": 0.5318323373794556, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6336696594953537, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.065, "step": 78 }, { "loss": 0.0088, "grad_norm": 2.0246036052703857, "learning_rate": 4.472222222222223e-05, "num_tokens": 187242.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.2732999920845032, "rewards/env_reward/std": 0.2833635210990906, "rewards/belief_accuracy/mean": 0.18856000900268555, "rewards/belief_accuracy/std": 0.034971851855516434, "reward": 0.20573002099990845, "reward_std": 0.39767947793006897, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.21958505548536777, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06583333333333333, "step": 79 }, { "loss": 0.0156, "grad_norm": 1.1923185586929321, "learning_rate": 4.458333333333334e-05, "num_tokens": 189682.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.148499995470047, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.366219162940979, "rewards/belief_accuracy/std": 0.04225583001971245, "reward": 0.9259074926376343, "reward_std": 0.12676748633384705, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3896181844174862, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06666666666666667, "step": 80 }, { "loss": 0.0128, "grad_norm": 0.7216500639915466, "learning_rate": 4.4444444444444447e-05, "num_tokens": 192122.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.047200001776218414, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2893427610397339, "rewards/belief_accuracy/std": 0.03550482541322708, "reward": 0.9888283014297485, "reward_std": 0.10651447623968124, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3197862319648266, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0675, "step": 81 }, { "loss": 0.0344, "grad_norm": 2.201247453689575, "learning_rate": 4.4305555555555556e-05, "num_tokens": 194232.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.16130000352859497, "rewards/env_reward/std": 0.2710742652416229, "rewards/belief_accuracy/mean": 0.31966283917427063, "rewards/belief_accuracy/std": 0.07682497054338455, "reward": 0.7670385241508484, "reward_std": 0.4939156472682953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.8603321500122547, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06833333333333333, "step": 82 }, { "loss": 0.0118, "grad_norm": 1.1376465559005737, "learning_rate": 4.4166666666666665e-05, "num_tokens": 196672.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5903750061988831, "rewards/env_reward/std": 0.039749979972839355, "rewards/belief_accuracy/mean": 0.2325861155986786, "rewards/belief_accuracy/std": 0.10147262364625931, "reward": 1.633320927619934, "reward_std": 0.29003316164016724, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2951663341373205, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06916666666666667, "step": 83 }, { "loss": 0.0322, "grad_norm": 12.477323532104492, "learning_rate": 4.402777777777778e-05, "num_tokens": 199124.0, "completions/mean_length": 13.0, "completions/min_length": 10.0, "completions/max_length": 22.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 22.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.290800005197525, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.340520977973938, "rewards/belief_accuracy/std": 0.03491540253162384, "reward": 1.5077629089355469, "reward_std": 0.10474622994661331, "frac_reward_zero_std": 0.0, "completion_length": 22.0, "kl": 0.8052617497742176, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07, "step": 84 }, { "loss": 0.0187, "grad_norm": 2.133931875228882, "learning_rate": 4.388888888888889e-05, "num_tokens": 201564.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5620499849319458, "rewards/env_reward/std": 0.06709998846054077, "rewards/belief_accuracy/mean": 0.30618077516555786, "rewards/belief_accuracy/std": 0.11218413710594177, "reward": 1.811617374420166, "reward_std": 0.4033048748970032, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.46640536189079285, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07083333333333333, "step": 85 }, { "loss": 0.0137, "grad_norm": 1.235334038734436, "learning_rate": 4.375e-05, "num_tokens": 204004.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.436599999666214, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.17671510577201843, "rewards/belief_accuracy/std": 0.08553336560726166, "reward": 1.235045313835144, "reward_std": 0.25660011172294617, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.34326545894145966, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07166666666666667, "step": 86 }, { "loss": 0.0162, "grad_norm": 3.338106393814087, "learning_rate": 4.3611111111111116e-05, "num_tokens": 206303.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.11562499403953552, "rewards/env_reward/std": 0.19865000247955322, "rewards/belief_accuracy/mean": 0.28272244334220886, "rewards/belief_accuracy/std": 0.06825492531061172, "reward": 1.0716049671173096, "reward_std": 0.33843618631362915, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4051658548414707, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0725, "step": 87 }, { "loss": 0.065, "grad_norm": 8.827055931091309, "learning_rate": 4.3472222222222225e-05, "num_tokens": 208765.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1459999978542328, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3536558151245117, "rewards/belief_accuracy/std": 0.055663757026195526, "reward": 1.3299674987792969, "reward_std": 0.1669912487268448, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 1.6250885128974915, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07333333333333333, "step": 88 }, { "loss": 0.0254, "grad_norm": 1.412042260169983, "learning_rate": 4.3333333333333334e-05, "num_tokens": 211205.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.14069999754428864, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.31789451837539673, "rewards/belief_accuracy/std": 0.03672884404659271, "reward": 1.214733600616455, "reward_std": 0.11018647253513336, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.633812952786684, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07416666666666667, "step": 89 }, { "loss": 0.0236, "grad_norm": 1.7908711433410645, "learning_rate": 4.319444444444445e-05, "num_tokens": 213645.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.047200001776218414, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.272844523191452, "rewards/belief_accuracy/std": 0.1207294762134552, "reward": 0.9393336176872253, "reward_std": 0.3621884286403656, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5902487859129906, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.075, "step": 90 }, { "loss": 0.0454, "grad_norm": 8.704280853271484, "learning_rate": 4.305555555555556e-05, "num_tokens": 216093.0, "completions/mean_length": 12.0, "completions/min_length": 10.0, "completions/max_length": 18.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 18.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6881999969482422, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.20553426444530487, "rewards/belief_accuracy/std": 0.05557806044816971, "reward": 1.6989027261734009, "reward_std": 0.16673420369625092, "frac_reward_zero_std": 0.0, "completion_length": 18.0, "kl": 1.1360323503613472, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07583333333333334, "step": 91 }, { "loss": 0.0283, "grad_norm": 4.743035793304443, "learning_rate": 4.291666666666667e-05, "num_tokens": 218204.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.48625001311302185, "rewards/env_reward/std": 0.08950001001358032, "rewards/belief_accuracy/mean": 0.24255633354187012, "rewards/belief_accuracy/std": 0.10362043231725693, "reward": 1.5070440769195557, "reward_std": 0.43784603476524353, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7067995071411133, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07666666666666666, "step": 92 }, { "loss": 0.0111, "grad_norm": 1.3156030178070068, "learning_rate": 4.277777777777778e-05, "num_tokens": 220644.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.555899977684021, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.36422252655029297, "rewards/belief_accuracy/std": 0.045512910932302475, "reward": 1.9765175580978394, "reward_std": 0.13653874397277832, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2781078703701496, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0775, "step": 93 }, { "loss": 0.0077, "grad_norm": 1.8292455673217773, "learning_rate": 4.263888888888889e-05, "num_tokens": 223084.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4133000075817108, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3569084405899048, "rewards/belief_accuracy/std": 0.05303797498345375, "reward": 1.740675449371338, "reward_std": 0.15911391377449036, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.19214149564504623, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07833333333333334, "step": 94 }, { "loss": 0.0229, "grad_norm": 3.2383766174316406, "learning_rate": 4.25e-05, "num_tokens": 225524.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6840999722480774, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.22559703886508942, "rewards/belief_accuracy/std": 0.09126444160938263, "reward": 1.7529411315917969, "reward_std": 0.2737933397293091, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.572553887963295, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07916666666666666, "step": 95 }, { "loss": 0.0225, "grad_norm": 1.971638798713684, "learning_rate": 4.236111111111111e-05, "num_tokens": 227963.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4016000032424927, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.34314823150634766, "rewards/belief_accuracy/std": 0.012927442789077759, "reward": 1.681844711303711, "reward_std": 0.03878229856491089, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5628382042050362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08, "step": 96 }, { "loss": 0.0089, "grad_norm": 1.9865643978118896, "learning_rate": 4.222222222222222e-05, "num_tokens": 230403.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.19689999520778656, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.33229726552963257, "rewards/belief_accuracy/std": 0.03385661542415619, "reward": 1.3422417640686035, "reward_std": 0.10156978666782379, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.22193955443799496, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08083333333333333, "step": 97 }, { "loss": 0.0083, "grad_norm": 1.4794961214065552, "learning_rate": 4.208333333333334e-05, "num_tokens": 232843.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.3410249948501587, "rewards/env_reward/std": 1.7726500034332275, "rewards/belief_accuracy/mean": 0.16232874989509583, "rewards/belief_accuracy/std": 0.2421344518661499, "reward": -0.02455127239227295, "reward_std": 3.4839961528778076, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2080898880958557, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08166666666666667, "step": 98 }, { "loss": 0.0137, "grad_norm": 2.3845081329345703, "learning_rate": 4.194444444444445e-05, "num_tokens": 235283.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.39559999108314514, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.337656170129776, "rewards/belief_accuracy/std": 0.031664397567510605, "reward": 1.6563684940338135, "reward_std": 0.09499318897724152, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.34126274287700653, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0825, "step": 99 }, { "loss": 0.0287, "grad_norm": 6.240288257598877, "learning_rate": 4.1805555555555556e-05, "num_tokens": 237235.0, "completions/mean_length": 15.0, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 9.333333969116211, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4211999773979187, "rewards/env_reward/std": 0.41892534494400024, "rewards/belief_accuracy/mean": 0.18393371999263763, "rewards/belief_accuracy/std": 0.15957118570804596, "reward": 1.2336010932922363, "reward_std": 0.6653106212615967, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.7164427638053894, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08333333333333333, "step": 100 }, { "loss": 0.0105, "grad_norm": 2.361682176589966, "learning_rate": 4.166666666666667e-05, "num_tokens": 239675.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7328000068664551, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.24569477140903473, "rewards/belief_accuracy/std": 0.06995491683483124, "reward": 1.886284351348877, "reward_std": 0.20986472070217133, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2631294522434473, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08416666666666667, "step": 101 }, { "loss": 0.0097, "grad_norm": 1.8391385078430176, "learning_rate": 4.152777777777778e-05, "num_tokens": 241607.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3151000142097473, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3252928853034973, "rewards/belief_accuracy/std": 0.041769690811634064, "reward": 1.4985287189483643, "reward_std": 0.12530909478664398, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.24248860776424408, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.085, "step": 102 }, { "loss": 0.0238, "grad_norm": 21.890640258789062, "learning_rate": 4.138888888888889e-05, "num_tokens": 244069.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.1923999935388565, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.22688540816307068, "rewards/belief_accuracy/std": 0.06325279176235199, "reward": 0.44205623865127563, "reward_std": 0.18975834548473358, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.5941529143601656, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08583333333333333, "step": 103 }, { "loss": 0.0098, "grad_norm": 2.787078380584717, "learning_rate": 4.125e-05, "num_tokens": 246509.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3725000023841858, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3388558626174927, "rewards/belief_accuracy/std": 0.04795009270310402, "reward": 1.6253175735473633, "reward_std": 0.14385031163692474, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2444281131029129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08666666666666667, "step": 104 }, { "loss": 0.0116, "grad_norm": 1.7878473997116089, "learning_rate": 4.111111111111111e-05, "num_tokens": 248949.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.30709999799728394, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.26045137643814087, "rewards/belief_accuracy/std": 0.06565836817026138, "reward": 1.292004108428955, "reward_std": 0.19697506725788116, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2907773107290268, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0875, "step": 105 }, { "loss": 0.0225, "grad_norm": 0.9713439345359802, "learning_rate": 4.0972222222222225e-05, "num_tokens": 251389.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.045899998396635056, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37923645973205566, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.2565593719482422, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5633938759565353, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08833333333333333, "step": 106 }, { "loss": 0.0131, "grad_norm": 1.3064583539962769, "learning_rate": 4.0833333333333334e-05, "num_tokens": 253829.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7106000185012817, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2697714567184448, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.9252145290374756, "reward_std": 0.06414999067783356, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3265066295862198, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08916666666666667, "step": 107 }, { "loss": 0.0133, "grad_norm": 2.1808035373687744, "learning_rate": 4.0694444444444444e-05, "num_tokens": 256269.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.06019999831914902, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.1825166642665863, "rewards/belief_accuracy/std": 0.08115018159151077, "reward": 0.6878499984741211, "reward_std": 0.24345053732395172, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.33351393789052963, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09, "step": 108 }, { "loss": 0.0096, "grad_norm": 1.2036610841751099, "learning_rate": 4.055555555555556e-05, "num_tokens": 258709.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.06159999966621399, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28903988003730774, "rewards/belief_accuracy/std": 0.026118090376257896, "reward": 1.0095196962356567, "reward_std": 0.07835428416728973, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.23895448073744774, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09083333333333334, "step": 109 }, { "loss": 0.0169, "grad_norm": 3.1538102626800537, "learning_rate": 4.041666666666667e-05, "num_tokens": 260824.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.19317498803138733, "rewards/env_reward/std": 0.26084083318710327, "rewards/belief_accuracy/mean": 0.3610350489616394, "rewards/belief_accuracy/std": 0.08047888427972794, "reward": 1.422867774963379, "reward_std": 0.5747905373573303, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4214295297861099, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09166666666666666, "step": 110 }, { "loss": 0.0226, "grad_norm": 2.361459255218506, "learning_rate": 4.027777777777778e-05, "num_tokens": 263286.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.1859000027179718, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.30193030834198, "rewards/belief_accuracy/std": 0.0667286217212677, "reward": 0.67694091796875, "reward_std": 0.2001858800649643, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.5647962130606174, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0925, "step": 111 }, { "loss": 0.0194, "grad_norm": 4.111879348754883, "learning_rate": 4.0138888888888894e-05, "num_tokens": 265395.0, "completions/mean_length": 9.25, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.25, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03242500126361847, "rewards/env_reward/std": 0.22145001590251923, "rewards/belief_accuracy/mean": 0.32505708932876587, "rewards/belief_accuracy/std": 0.06295914947986603, "reward": 1.0738086700439453, "reward_std": 0.5100609660148621, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.48548348993062973, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09333333333333334, "step": 112 }, { "loss": 0.0179, "grad_norm": 2.7761991024017334, "learning_rate": 4e-05, "num_tokens": 267835.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.019099999219179153, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.36737579107284546, "rewards/belief_accuracy/std": 0.031964562833309174, "reward": 1.1807773113250732, "reward_std": 0.09589369595050812, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.44813764840364456, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09416666666666666, "step": 113 }, { "loss": 0.0078, "grad_norm": 2.152681827545166, "learning_rate": 3.986111111111111e-05, "num_tokens": 270275.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03530000150203705, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.34184545278549194, "rewards/belief_accuracy/std": 0.04799266904592514, "reward": 1.1284863948822021, "reward_std": 0.14397799968719482, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.19383717328310013, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.095, "step": 114 }, { "loss": 0.0141, "grad_norm": 1.2871294021606445, "learning_rate": 3.972222222222222e-05, "num_tokens": 272715.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7462000250816345, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2832767069339752, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 2.019130229949951, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.35344624519348145, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09583333333333334, "step": 115 }, { "loss": 0.0136, "grad_norm": 4.147212982177734, "learning_rate": 3.958333333333333e-05, "num_tokens": 274441.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.11590000241994858, "rewards/env_reward/std": 0.22251078486442566, "rewards/belief_accuracy/mean": 0.33136284351348877, "rewards/belief_accuracy/std": 0.05834528058767319, "reward": 0.8702385425567627, "reward_std": 0.22163395583629608, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.33963513001799583, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09666666666666666, "step": 116 }, { "loss": 0.0097, "grad_norm": 3.5441172122955322, "learning_rate": 3.944444444444445e-05, "num_tokens": 276903.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2621999979019165, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3731135427951813, "rewards/belief_accuracy/std": 0.07407407462596893, "reward": 1.56264066696167, "reward_std": 0.22222228348255157, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.2427344862371683, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0975, "step": 117 }, { "loss": 0.0194, "grad_norm": 4.864804267883301, "learning_rate": 3.9305555555555556e-05, "num_tokens": 279013.0, "completions/mean_length": 9.5, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4486500024795532, "rewards/env_reward/std": 0.24901118874549866, "rewards/belief_accuracy/mean": 0.3373388946056366, "rewards/belief_accuracy/std": 0.03984580561518669, "reward": 1.7349917888641357, "reward_std": 0.3733745217323303, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4837944805622101, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09833333333333333, "step": 118 }, { "loss": 0.0104, "grad_norm": 2.8735086917877197, "learning_rate": 3.9166666666666665e-05, "num_tokens": 281453.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.24619999527931213, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2666420042514801, "rewards/belief_accuracy/std": 0.061688266694545746, "reward": 1.2192261219024658, "reward_std": 0.18506474792957306, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2612016424536705, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09916666666666667, "step": 119 }, { "loss": 0.0206, "grad_norm": 1.73070228099823, "learning_rate": 3.902777777777778e-05, "num_tokens": 283893.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.337799996137619, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.31319916248321533, "rewards/belief_accuracy/std": 0.045859288424253464, "reward": 1.4962975978851318, "reward_std": 0.13757789134979248, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5154507085680962, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1, "step": 120 }, { "loss": 0.0241, "grad_norm": 1.9448511600494385, "learning_rate": 3.888888888888889e-05, "num_tokens": 286339.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2249000072479248, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4207599461078644, "rewards/belief_accuracy/std": 0.0340244434773922, "reward": 1.649629831314087, "reward_std": 0.10207336395978928, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 0.602842066437006, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10083333333333333, "step": 121 }, { "loss": 0.0154, "grad_norm": 3.713958501815796, "learning_rate": 3.875e-05, "num_tokens": 288779.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4221999943256378, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.20623962581157684, "rewards/belief_accuracy/std": 0.05671283230185509, "reward": 1.3020188808441162, "reward_std": 0.17013852298259735, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.38420072570443153, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10166666666666667, "step": 122 }, { "loss": 0.0076, "grad_norm": 2.8310298919677734, "learning_rate": 3.8611111111111116e-05, "num_tokens": 291219.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.11249999701976776, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3014543652534485, "rewards/belief_accuracy/std": 0.03703703731298447, "reward": 0.7856130599975586, "reward_std": 0.1111110970377922, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.19116491079330444, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1025, "step": 123 }, { "loss": 0.0207, "grad_norm": 6.588038444519043, "learning_rate": 3.8472222222222225e-05, "num_tokens": 293665.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3255000114440918, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.22401908040046692, "rewards/belief_accuracy/std": 0.03546026349067688, "reward": 1.210307240486145, "reward_std": 0.10638084262609482, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 0.5185928121209145, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10333333333333333, "step": 124 }, { "loss": 0.0111, "grad_norm": 3.076918363571167, "learning_rate": 3.8333333333333334e-05, "num_tokens": 296106.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.13120000064373016, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3789128065109253, "rewards/belief_accuracy/std": 0.03271512687206268, "reward": 1.3835383653640747, "reward_std": 0.09814538806676865, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.277778223156929, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10416666666666667, "step": 125 }, { "loss": 0.0152, "grad_norm": 1.4967398643493652, "learning_rate": 3.8194444444444444e-05, "num_tokens": 298546.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.567300021648407, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35695987939834595, "rewards/belief_accuracy/std": 0.04681692272424698, "reward": 1.9718296527862549, "reward_std": 0.14045073091983795, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3801155686378479, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.105, "step": 126 }, { "loss": 0.013, "grad_norm": 2.761016368865967, "learning_rate": 3.805555555555555e-05, "num_tokens": 300658.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7590750455856323, "rewards/env_reward/std": 0.2601499855518341, "rewards/belief_accuracy/mean": 0.339000940322876, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 2.205615282058716, "reward_std": 0.4304605722427368, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.32585281878709793, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10583333333333333, "step": 127 }, { "loss": 0.0474, "grad_norm": 5.960784435272217, "learning_rate": 3.791666666666667e-05, "num_tokens": 303102.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5572999715805054, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3400188684463501, "rewards/belief_accuracy/std": 0.08212429285049438, "reward": 1.9060065746307373, "reward_std": 0.24637286365032196, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 1.1858660280704498, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10666666666666667, "step": 128 }, { "loss": 0.0178, "grad_norm": 3.856426954269409, "learning_rate": 3.777777777777778e-05, "num_tokens": 305220.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.8468999862670898, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.18142487108707428, "rewards/belief_accuracy/std": 0.03703703731298447, "reward": 1.8646245002746582, "reward_std": 0.11111116409301758, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.44381893426179886, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1075, "step": 129 }, { "loss": 0.0154, "grad_norm": 2.8265440464019775, "learning_rate": 3.763888888888889e-05, "num_tokens": 307152.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.11757499724626541, "rewards/env_reward/std": 0.18415001034736633, "rewards/belief_accuracy/mean": 0.36771130561828613, "rewards/belief_accuracy/std": 0.07701562345027924, "reward": 1.3294965028762817, "reward_std": 0.09841816127300262, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.38515937700867653, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10833333333333334, "step": 130 }, { "loss": 0.0087, "grad_norm": 4.664041519165039, "learning_rate": 3.7500000000000003e-05, "num_tokens": 309592.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04830000177025795, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3575194478034973, "rewards/belief_accuracy/std": 0.052378278225660324, "reward": 1.1950082778930664, "reward_std": 0.15713481605052948, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.21777665056288242, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10916666666666666, "step": 131 }, { "loss": 0.0712, "grad_norm": 2.597440481185913, "learning_rate": 3.736111111111111e-05, "num_tokens": 312035.0, "completions/mean_length": 10.75, "completions/min_length": 10.0, "completions/max_length": 13.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 13.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.42980000376701355, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4375982880592346, "rewards/belief_accuracy/std": 0.030712537467479706, "reward": 2.0074949264526367, "reward_std": 0.09213761240243912, "frac_reward_zero_std": 0.0, "completion_length": 13.0, "kl": 1.7791741862893105, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11, "step": 132 }, { "loss": 0.0143, "grad_norm": 1.8326061964035034, "learning_rate": 3.722222222222222e-05, "num_tokens": 314335.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.35420000553131104, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43421751260757446, "rewards/belief_accuracy/std": 0.016325272619724274, "reward": 1.8839524984359741, "reward_std": 0.048975855112075806, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.35671964287757874, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11083333333333334, "step": 133 }, { "loss": 0.0136, "grad_norm": 2.641361951828003, "learning_rate": 3.708333333333334e-05, "num_tokens": 316447.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4428499937057495, "rewards/env_reward/std": 0.1823849380016327, "rewards/belief_accuracy/mean": 0.443654328584671, "rewards/belief_accuracy/std": 0.021532177925109863, "reward": 2.0452380180358887, "reward_std": 0.317529559135437, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3387632220983505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11166666666666666, "step": 134 }, { "loss": 0.3615, "grad_norm": 23.68372917175293, "learning_rate": 3.694444444444445e-05, "num_tokens": 318887.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16949999332427979, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35687991976737976, "rewards/belief_accuracy/std": 0.06190773472189903, "reward": 1.3748897314071655, "reward_std": 0.185723215341568, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 9.03631467744708, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1125, "step": 135 }, { "loss": 0.0123, "grad_norm": 1.9297913312911987, "learning_rate": 3.6805555555555556e-05, "num_tokens": 321327.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.0325000286102295, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.25393518805503845, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 2.360555648803711, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3083711676299572, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11333333333333333, "step": 136 }, { "loss": 0.0132, "grad_norm": 4.193662166595459, "learning_rate": 3.6666666666666666e-05, "num_tokens": 323806.0, "completions/mean_length": 19.75, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 15.666666984558105, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 27.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03889999911189079, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3806675672531128, "rewards/belief_accuracy/std": 0.03505498543381691, "reward": 1.2503528594970703, "reward_std": 0.10516492277383804, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.33080876618623734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11416666666666667, "step": 137 }, { "loss": 0.0193, "grad_norm": 1.0646451711654663, "learning_rate": 3.6527777777777775e-05, "num_tokens": 326246.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20409999787807465, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.42438197135925293, "rewards/belief_accuracy/std": 0.03543723374605179, "reward": 1.6292959451675415, "reward_std": 0.10631169378757477, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4817114397883415, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.115, "step": 138 }, { "loss": 0.013, "grad_norm": 2.5362420082092285, "learning_rate": 3.638888888888889e-05, "num_tokens": 328686.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.12409999966621399, "rewards/env_reward/std": 0.02619999647140503, "rewards/belief_accuracy/mean": 0.3048698902130127, "rewards/belief_accuracy/std": 0.028253890573978424, "reward": 1.1507596969604492, "reward_std": 0.09127989411354065, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3246644027531147, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11583333333333333, "step": 139 }, { "loss": 0.0135, "grad_norm": 2.2923882007598877, "learning_rate": 3.625e-05, "num_tokens": 330414.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.23544999957084656, "rewards/env_reward/std": 0.234499990940094, "rewards/belief_accuracy/mean": 0.28640225529670715, "rewards/belief_accuracy/std": 0.01621234230697155, "reward": 1.2623817920684814, "reward_std": 0.3048122227191925, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3384244814515114, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11666666666666667, "step": 140 }, { "loss": 0.0146, "grad_norm": 1.7288119792938232, "learning_rate": 3.611111111111111e-05, "num_tokens": 332854.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.34200000762939453, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.27031537890434265, "rewards/belief_accuracy/std": 0.03024062141776085, "reward": 1.373946189880371, "reward_std": 0.09072189033031464, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.36574723571538925, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1175, "step": 141 }, { "loss": 0.014, "grad_norm": 2.0891668796539307, "learning_rate": 3.5972222222222225e-05, "num_tokens": 334970.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.02250000089406967, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2285890281200409, "rewards/belief_accuracy/std": 0.06315205991268158, "reward": 0.7695170640945435, "reward_std": 0.18945620954036713, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3505154550075531, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11833333333333333, "step": 142 }, { "loss": 0.0101, "grad_norm": 3.0063207149505615, "learning_rate": 3.5833333333333335e-05, "num_tokens": 337410.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.13079999387264252, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35887354612350464, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 0.9304206371307373, "reward_std": 0.055555541068315506, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2529054693877697, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11916666666666667, "step": 143 }, { "loss": 0.0233, "grad_norm": 1.956833839416504, "learning_rate": 3.5694444444444444e-05, "num_tokens": 339850.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.30309998989105225, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.1861644983291626, "rewards/belief_accuracy/std": 0.0555555522441864, "reward": 1.0631434917449951, "reward_std": 0.166666641831398, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5815155953168869, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12, "step": 144 }, { "loss": 0.033, "grad_norm": 6.329380035400391, "learning_rate": 3.555555555555556e-05, "num_tokens": 342313.0, "completions/mean_length": 15.75, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.333333969116211, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.051899999380111694, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3755284249782562, "rewards/belief_accuracy/std": 0.04781460389494896, "reward": 1.0987353324890137, "reward_std": 0.14344383776187897, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.8242634683847427, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12083333333333333, "step": 145 }, { "loss": 0.0159, "grad_norm": 1.634685754776001, "learning_rate": 3.541666666666667e-05, "num_tokens": 344753.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.28029999136924744, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.31596264243125916, "rewards/belief_accuracy/std": 0.02276109904050827, "reward": 1.4183378219604492, "reward_std": 0.06828323751688004, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.39855731278657913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12166666666666667, "step": 146 }, { "loss": 0.0154, "grad_norm": 1.3948684930801392, "learning_rate": 3.527777777777778e-05, "num_tokens": 347193.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16419999301433563, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2850886285305023, "rewards/belief_accuracy/std": 0.020031645894050598, "reward": 1.1515657901763916, "reward_std": 0.06009494140744209, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.38383544236421585, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1225, "step": 147 }, { "loss": 0.0242, "grad_norm": 1.6996270418167114, "learning_rate": 3.513888888888889e-05, "num_tokens": 349633.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.13830000162124634, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.30072730779647827, "rewards/belief_accuracy/std": 0.03546027094125748, "reward": 1.1596319675445557, "reward_std": 0.10638084262609482, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6058650612831116, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12333333333333334, "step": 148 }, { "loss": 0.0137, "grad_norm": 2.426046848297119, "learning_rate": 3.5e-05, "num_tokens": 352073.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.12460000067949295, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.31412917375564575, "rewards/belief_accuracy/std": 0.03284599259495735, "reward": 1.1792874336242676, "reward_std": 0.09853797405958176, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.34162479639053345, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12416666666666666, "step": 149 }, { "loss": 0.0261, "grad_norm": 7.012240409851074, "learning_rate": 3.486111111111111e-05, "num_tokens": 354522.0, "completions/mean_length": 12.25, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.06894999742507935, "rewards/env_reward/std": 0.010099999606609344, "rewards/belief_accuracy/mean": 0.26061785221099854, "rewards/belief_accuracy/std": 0.030240608379244804, "reward": 0.9352785348892212, "reward_std": 0.07883862406015396, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.6527384743094444, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.125, "step": 150 }, { "loss": 0.0274, "grad_norm": 12.943872451782227, "learning_rate": 3.472222222222222e-05, "num_tokens": 356966.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6164000034332275, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28171297907829285, "rewards/belief_accuracy/std": 0.04276669770479202, "reward": 1.8197388648986816, "reward_std": 0.12830005586147308, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.6852549761533737, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12583333333333332, "step": 151 }, { "loss": 0.0147, "grad_norm": 2.472937822341919, "learning_rate": 3.458333333333333e-05, "num_tokens": 359407.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.12862500548362732, "rewards/env_reward/std": 0.012850001454353333, "rewards/belief_accuracy/mean": 0.3199460506439209, "rewards/belief_accuracy/std": 0.009259253740310669, "reward": 1.2027755975723267, "reward_std": 0.02803901769220829, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.3669867143034935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12666666666666668, "step": 152 }, { "loss": 0.0329, "grad_norm": 1.7880311012268066, "learning_rate": 3.444444444444445e-05, "num_tokens": 361519.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6618000268936157, "rewards/env_reward/std": 0.20403560996055603, "rewards/belief_accuracy/mean": 0.3322972357273102, "rewards/belief_accuracy/std": 0.03908432275056839, "reward": 2.0395917892456055, "reward_std": 0.3787125051021576, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.8225667700171471, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1275, "step": 153 }, { "loss": 0.0165, "grad_norm": 2.15445613861084, "learning_rate": 3.430555555555556e-05, "num_tokens": 363959.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.03149999678134918, "rewards/env_reward/std": 0.09520000219345093, "rewards/belief_accuracy/mean": 0.17216560244560242, "rewards/belief_accuracy/std": 0.018518514931201935, "reward": 0.5192468166351318, "reward_std": 0.08724445104598999, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.41129303723573685, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12833333333333333, "step": 154 }, { "loss": 0.0141, "grad_norm": 1.1737233400344849, "learning_rate": 3.4166666666666666e-05, "num_tokens": 366399.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.0414000004529953, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2698771059513092, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 0.9217313528060913, "reward_std": 0.055555522441864014, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.35325073450803757, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12916666666666668, "step": 155 }, { "loss": 0.0307, "grad_norm": 11.461524963378906, "learning_rate": 3.402777777777778e-05, "num_tokens": 368842.0, "completions/mean_length": 10.75, "completions/min_length": 10.0, "completions/max_length": 13.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 13.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.391400009393692, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.40906423330307007, "rewards/belief_accuracy/std": 0.04156642034649849, "reward": 1.8642927408218384, "reward_std": 0.12469932436943054, "frac_reward_zero_std": 0.0, "completion_length": 13.0, "kl": 0.7672573626041412, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13, "step": 156 }, { "loss": 0.0158, "grad_norm": 1.6521883010864258, "learning_rate": 3.388888888888889e-05, "num_tokens": 371282.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.29260000586509705, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37149137258529663, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.6033741235733032, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3942847102880478, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13083333333333333, "step": 157 }, { "loss": 0.0162, "grad_norm": 2.972644329071045, "learning_rate": 3.375000000000001e-05, "num_tokens": 373722.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.2614000141620636, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.29159414768218994, "rewards/belief_accuracy/std": 0.007962316274642944, "reward": 0.5326824188232422, "reward_std": 0.02388697862625122, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.40470410883426666, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13166666666666665, "step": 158 }, { "loss": 0.0149, "grad_norm": 2.185356855392456, "learning_rate": 3.3611111111111116e-05, "num_tokens": 376162.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5100749731063843, "rewards/env_reward/std": 0.01074999663978815, "rewards/belief_accuracy/mean": 0.3106195032596588, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.7469711303710938, "reward_std": 0.039430540055036545, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3720128685235977, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1325, "step": 159 }, { "loss": 0.0211, "grad_norm": 6.548361301422119, "learning_rate": 3.347222222222222e-05, "num_tokens": 378607.0, "completions/mean_length": 11.25, "completions/min_length": 10.0, "completions/max_length": 15.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 15.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.12084999680519104, "rewards/env_reward/std": 0.07372763007879257, "rewards/belief_accuracy/mean": 0.3570098876953125, "rewards/belief_accuracy/std": 0.030240608379244804, "reward": 1.302304744720459, "reward_std": 0.1430416703224182, "frac_reward_zero_std": 0.0, "completion_length": 15.0, "kl": 0.527301162481308, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13333333333333333, "step": 160 }, { "loss": 0.0138, "grad_norm": 4.99193000793457, "learning_rate": 3.3333333333333335e-05, "num_tokens": 381047.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.13249999284744263, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.30360713601112366, "rewards/belief_accuracy/std": 0.02821623906493187, "reward": 1.1595714092254639, "reward_std": 0.08464870601892471, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3444783613085747, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13416666666666666, "step": 161 }, { "loss": 0.0183, "grad_norm": 2.1529526710510254, "learning_rate": 3.3194444444444444e-05, "num_tokens": 383509.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.10860000550746918, "rewards/env_reward/std": 0.009930425323545933, "rewards/belief_accuracy/mean": 0.3788100481033325, "rewards/belief_accuracy/std": 0.03024062141776085, "reward": 1.349330186843872, "reward_std": 0.10180103778839111, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.4582265689969063, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.135, "step": 162 }, { "loss": 0.0151, "grad_norm": 1.7175657749176025, "learning_rate": 3.3055555555555553e-05, "num_tokens": 385625.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4354749917984009, "rewards/env_reward/std": 0.4491499960422516, "rewards/belief_accuracy/mean": 0.328631192445755, "rewards/belief_accuracy/std": 0.024180419743061066, "reward": 1.6891059875488281, "reward_std": 0.6854898929595947, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.37873465567827225, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13583333333333333, "step": 163 }, { "loss": 0.0126, "grad_norm": 4.273981094360352, "learning_rate": 3.291666666666667e-05, "num_tokens": 388065.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5026999711990356, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44292253255844116, "rewards/belief_accuracy/std": 0.035414163023233414, "reward": 2.1328177452087402, "reward_std": 0.10624248534440994, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3159671910107136, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13666666666666666, "step": 164 }, { "loss": 0.0161, "grad_norm": 3.6132190227508545, "learning_rate": 3.277777777777778e-05, "num_tokens": 390505.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.1704999953508377, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.29170024394989014, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 0.6693507432937622, "reward_std": 0.06415002793073654, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4029811918735504, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1375, "step": 165 }, { "loss": 0.0407, "grad_norm": 6.097934246063232, "learning_rate": 3.263888888888889e-05, "num_tokens": 392956.0, "completions/mean_length": 12.75, "completions/min_length": 10.0, "completions/max_length": 17.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 17.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.039900001138448715, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35646331310272217, "rewards/belief_accuracy/std": 0.015120310708880424, "reward": 1.1792399883270264, "reward_std": 0.04536087065935135, "frac_reward_zero_std": 0.0, "completion_length": 17.0, "kl": 1.0165367349982262, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13833333333333334, "step": 166 }, { "loss": 0.017, "grad_norm": 6.14641809463501, "learning_rate": 3.2500000000000004e-05, "num_tokens": 395400.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.10329999774694443, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.30679982900619507, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.125349521636963, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.4241368919610977, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13916666666666666, "step": 167 }, { "loss": 0.0247, "grad_norm": 5.288893222808838, "learning_rate": 3.236111111111111e-05, "num_tokens": 397873.0, "completions/mean_length": 18.25, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 21.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.29954999685287476, "rewards/env_reward/std": 0.03827832639217377, "rewards/belief_accuracy/mean": 0.2795746326446533, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.3380489349365234, "reward_std": 0.051960092037916183, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.6163829118013382, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14, "step": 168 }, { "loss": 0.0211, "grad_norm": 6.591170787811279, "learning_rate": 3.222222222222223e-05, "num_tokens": 400329.0, "completions/mean_length": 14.0, "completions/min_length": 10.0, "completions/max_length": 22.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 22.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.37950000166893005, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3927474319934845, "rewards/belief_accuracy/std": 0.010272752493619919, "reward": 1.7974923849105835, "reward_std": 0.030818266794085503, "frac_reward_zero_std": 0.0, "completion_length": 22.0, "kl": 0.5263254791498184, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14083333333333334, "step": 169 }, { "loss": 0.0311, "grad_norm": 2.159360885620117, "learning_rate": 3.208333333333334e-05, "num_tokens": 402773.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20149999856948853, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.34553590416908264, "rewards/belief_accuracy/std": 0.0, "reward": 1.3888577222824097, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 14.0, "kl": 0.7782266139984131, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14166666666666666, "step": 170 }, { "loss": 0.0469, "grad_norm": 2.983961343765259, "learning_rate": 3.194444444444444e-05, "num_tokens": 405229.0, "completions/mean_length": 14.0, "completions/min_length": 10.0, "completions/max_length": 26.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 26.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.15469999611377716, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4289877712726593, "rewards/belief_accuracy/std": 0.0, "reward": 1.5690133571624756, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 26.0, "kl": 1.172668918967247, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1425, "step": 171 }, { "loss": 0.0127, "grad_norm": 2.0448641777038574, "learning_rate": 3.180555555555556e-05, "num_tokens": 407669.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04535000026226044, "rewards/env_reward/std": 0.009099999442696571, "rewards/belief_accuracy/mean": 0.3892820477485657, "rewards/belief_accuracy/std": 0.0, "reward": 1.2858712673187256, "reward_std": 0.013649980537593365, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.31760095804929733, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14333333333333334, "step": 172 }, { "loss": 0.0772, "grad_norm": 9.481169700622559, "learning_rate": 3.1666666666666666e-05, "num_tokens": 410116.0, "completions/mean_length": 11.75, "completions/min_length": 10.0, "completions/max_length": 17.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 17.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.34950000047683716, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.32457566261291504, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.5479769706726074, "reward_std": 0.05555546283721924, "frac_reward_zero_std": 0.0, "completion_length": 17.0, "kl": 1.9302819445729256, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14416666666666667, "step": 173 }, { "loss": 0.0246, "grad_norm": 5.64962100982666, "learning_rate": 3.1527777777777775e-05, "num_tokens": 412564.0, "completions/mean_length": 12.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6260499954223633, "rewards/env_reward/std": 0.27230000495910645, "rewards/belief_accuracy/mean": 0.3570099174976349, "rewards/belief_accuracy/std": 0.0, "reward": 2.0601048469543457, "reward_std": 0.4084498882293701, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.6141732186079025, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.145, "step": 174 }, { "loss": 0.0247, "grad_norm": 19.281583786010742, "learning_rate": 3.138888888888889e-05, "num_tokens": 415012.0, "completions/mean_length": 12.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.32510000467300415, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2631944417953491, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.3272333145141602, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.6186791881918907, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14583333333333334, "step": 175 }, { "loss": 0.0132, "grad_norm": 0.0038161983247846365, "learning_rate": 3.125e-05, "num_tokens": 417452.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.34139999747276306, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.26061785221099854, "rewards/belief_accuracy/std": 0.0, "reward": 1.3439536094665527, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.32898589968681335, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14666666666666667, "step": 176 }, { "loss": 0.022, "grad_norm": 0.02249019406735897, "learning_rate": 3.111111111111111e-05, "num_tokens": 419892.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.02329999953508377, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.42342302203178406, "rewards/belief_accuracy/std": 0.0, "reward": 1.3552190065383911, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5504751205444336, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1475, "step": 177 }, { "loss": 0.0131, "grad_norm": 0.004471380263566971, "learning_rate": 3.0972222222222226e-05, "num_tokens": 422332.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4235999882221222, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3073524236679077, "rewards/belief_accuracy/std": 0.0, "reward": 1.6074572801589966, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.3276918828487396, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14833333333333334, "step": 178 }, { "loss": 0.0203, "grad_norm": 12.483266830444336, "learning_rate": 3.0833333333333335e-05, "num_tokens": 424776.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.09359999746084213, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2935647964477539, "rewards/belief_accuracy/std": 0.004193440079689026, "reward": 1.071094274520874, "reward_std": 0.012580275535583496, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.5065018609166145, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14916666666666667, "step": 179 }, { "loss": 0.0178, "grad_norm": 0.9969475865364075, "learning_rate": 3.069444444444445e-05, "num_tokens": 427260.0, "completions/mean_length": 21.0, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.5, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4616999924182892, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.38445019721984863, "rewards/belief_accuracy/std": 0.0, "reward": 1.8959006071090698, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 32.0, "kl": 0.444540049880743, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15, "step": 180 }, { "loss": 0.0656, "grad_norm": 10.70042610168457, "learning_rate": 3.055555555555556e-05, "num_tokens": 429703.0, "completions/mean_length": 10.75, "completions/min_length": 10.0, "completions/max_length": 13.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 13.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04580000042915344, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4158470928668976, "rewards/belief_accuracy/std": 0.0, "reward": 1.3662413358688354, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 13.0, "kl": 1.6388273686170578, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15083333333333335, "step": 181 }, { "loss": 0.0424, "grad_norm": 2.0670108795166016, "learning_rate": 3.0416666666666666e-05, "num_tokens": 432159.0, "completions/mean_length": 14.0, "completions/min_length": 14.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.11574999988079071, "rewards/env_reward/std": 0.004099187441170216, "rewards/belief_accuracy/mean": 0.3253183662891388, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.199580192565918, "reward_std": 0.059318430721759796, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 1.061010330915451, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15166666666666667, "step": 182 }, { "loss": 0.0293, "grad_norm": 4.715631484985352, "learning_rate": 3.0277777777777776e-05, "num_tokens": 434487.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4307500123977661, "rewards/env_reward/std": 0.1354999989271164, "rewards/belief_accuracy/mean": 0.22851046919822693, "rewards/belief_accuracy/std": 0.018518514931201935, "reward": 1.3816564083099365, "reward_std": 0.19201354682445526, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.733733519911766, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1525, "step": 183 }, { "loss": 0.0246, "grad_norm": 3.4244911670684814, "learning_rate": 3.0138888888888888e-05, "num_tokens": 436931.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04179999977350235, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.33457762002944946, "rewards/belief_accuracy/std": 0.0, "reward": 1.1164329051971436, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 14.0, "kl": 0.615076094865799, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15333333333333332, "step": 184 }, { "loss": 0.0308, "grad_norm": 7.267205238342285, "learning_rate": 3e-05, "num_tokens": 439405.0, "completions/mean_length": 18.5, "completions/min_length": 14.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.17339999973773956, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3959105610847473, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 0.9776316285133362, "reward_std": 0.05555548146367073, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.7707809656858444, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15416666666666667, "step": 185 }, { "loss": 0.0138, "grad_norm": 4.635013103485107, "learning_rate": 2.9861111111111113e-05, "num_tokens": 441845.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2635999917984009, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.40296870470046997, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.654306173324585, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3450919836759567, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.155, "step": 186 }, { "loss": 0.0374, "grad_norm": 409.9649963378906, "learning_rate": 2.9722222222222223e-05, "num_tokens": 444307.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3379000127315521, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.310455858707428, "rewards/belief_accuracy/std": 0.0016919821500778198, "reward": 1.4882175922393799, "reward_std": 0.005075931549072266, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.933860570192337, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15583333333333332, "step": 187 }, { "loss": 0.0485, "grad_norm": 0.20684058964252472, "learning_rate": 2.9583333333333335e-05, "num_tokens": 446763.0, "completions/mean_length": 14.0, "completions/min_length": 14.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2190999984741211, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.34553590416908264, "rewards/belief_accuracy/std": 0.0, "reward": 1.4152576923370361, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 14.0, "kl": 1.212807685136795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15666666666666668, "step": 188 }, { "loss": 0.0183, "grad_norm": 2.166232109069824, "learning_rate": 2.9444444444444448e-05, "num_tokens": 448695.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.11839999258518219, "rewards/env_reward/std": 0.13920000195503235, "rewards/belief_accuracy/mean": 0.40842756628990173, "rewards/belief_accuracy/std": 0.0, "reward": 1.4528826475143433, "reward_std": 0.2088000327348709, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.45745784789323807, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1575, "step": 189 }, { "loss": 0.0412, "grad_norm": 0.026986636221408844, "learning_rate": 2.9305555555555557e-05, "num_tokens": 451151.0, "completions/mean_length": 14.0, "completions/min_length": 14.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.042100001126527786, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.22358080744743347, "rewards/belief_accuracy/std": 0.0, "reward": 0.7838923931121826, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 14.0, "kl": 1.0310405790805817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15833333333333333, "step": 190 }, { "loss": 0.0161, "grad_norm": 2.072092056274414, "learning_rate": 2.916666666666667e-05, "num_tokens": 453591.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.060850001871585846, "rewards/env_reward/std": 0.13030001521110535, "rewards/belief_accuracy/mean": 0.4303731918334961, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.2498445510864258, "reward_std": 0.1845216602087021, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.402814045548439, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15916666666666668, "step": 191 }, { "loss": 0.0208, "grad_norm": 4.364622116088867, "learning_rate": 2.9027777777777782e-05, "num_tokens": 456057.0, "completions/mean_length": 16.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 11.333333969116211, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.600600004196167, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28782621026039124, "rewards/belief_accuracy/std": 0.0, "reward": 1.8143787384033203, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 32.0, "kl": 0.520959883928299, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16, "step": 192 }, { "loss": 0.0224, "grad_norm": 6.7475361824035645, "learning_rate": 2.8888888888888888e-05, "num_tokens": 458531.0, "completions/mean_length": 18.5, "completions/min_length": 14.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.4886999726295471, "rewards/env_reward/std": 1.6742000579833984, "rewards/belief_accuracy/mean": 0.23410753905773163, "rewards/belief_accuracy/std": 0.2894050180912018, "reward": -0.030727386474609375, "reward_std": 3.4795150756835938, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.5597751773893833, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16083333333333333, "step": 193 }, { "loss": 0.0371, "grad_norm": 2.9581692218780518, "learning_rate": 2.8749999999999997e-05, "num_tokens": 460992.0, "completions/mean_length": 15.25, "completions/min_length": 14.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.25, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7976999878883362, "rewards/env_reward/std": 0.1690000295639038, "rewards/belief_accuracy/mean": 0.28782621026039124, "rewards/belief_accuracy/std": 0.0, "reward": 2.1100287437438965, "reward_std": 0.2535001337528229, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.9280332177877426, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16166666666666665, "step": 194 }, { "loss": 0.0383, "grad_norm": 2.230289936065674, "learning_rate": 2.861111111111111e-05, "num_tokens": 463448.0, "completions/mean_length": 14.0, "completions/min_length": 14.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.18412499129772186, "rewards/env_reward/std": 0.0393499955534935, "rewards/belief_accuracy/mean": 0.46602481603622437, "rewards/belief_accuracy/std": 0.0, "reward": 1.7242618799209595, "reward_std": 0.059025008231401443, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.9568361341953278, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1625, "step": 195 }, { "loss": 0.0314, "grad_norm": 0.7262094616889954, "learning_rate": 2.8472222222222223e-05, "num_tokens": 465900.0, "completions/mean_length": 13.0, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04560000076889992, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.22358080744743347, "rewards/belief_accuracy/std": 0.0, "reward": 0.789142370223999, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 16.0, "kl": 0.7856011986732483, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16333333333333333, "step": 196 }, { "loss": 0.0197, "grad_norm": 0.02980131469666958, "learning_rate": 2.8333333333333335e-05, "num_tokens": 468340.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2460000067949295, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.269497811794281, "rewards/belief_accuracy/std": 0.0, "reward": 1.2274935245513916, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.49150124192237854, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16416666666666666, "step": 197 }, { "loss": 0.0255, "grad_norm": 0.6604632139205933, "learning_rate": 2.8194444444444445e-05, "num_tokens": 470784.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.2312999963760376, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4617558717727661, "rewards/belief_accuracy/std": 0.0, "reward": 1.088317632675171, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 14.0, "kl": 0.6369934007525444, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.165, "step": 198 }, { "loss": 0.0534, "grad_norm": 1.6423118114471436, "learning_rate": 2.8055555555555557e-05, "num_tokens": 473256.0, "completions/mean_length": 18.0, "completions/min_length": 14.0, "completions/max_length": 30.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.0, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 30.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.019899999722838402, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.386385977268219, "rewards/belief_accuracy/std": 0.0, "reward": 1.2390079498291016, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 30.0, "kl": 1.3361373841762543, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16583333333333333, "step": 199 }, { "loss": 0.0299, "grad_norm": 0.3381861746311188, "learning_rate": 2.791666666666667e-05, "num_tokens": 475704.0, "completions/mean_length": 12.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04659999907016754, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.386385977268219, "rewards/belief_accuracy/std": 0.0, "reward": 1.2790579795837402, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 14.0, "kl": 0.7480977475643158, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16666666666666666, "step": 200 }, { "loss": 0.0629, "grad_norm": 3.254331588745117, "learning_rate": 2.777777777777778e-05, "num_tokens": 478178.0, "completions/mean_length": 18.5, "completions/min_length": 14.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.10769999772310257, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37561896443367004, "rewards/belief_accuracy/std": 0.0, "reward": 1.3384069204330444, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 32.0, "kl": 1.571349710226059, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1675, "step": 201 }, { "loss": 0.0192, "grad_norm": 0.34370848536491394, "learning_rate": 2.7638888888888892e-05, "num_tokens": 480621.0, "completions/mean_length": 10.75, "completions/min_length": 10.0, "completions/max_length": 13.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 13.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.15160000324249268, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3113018572330475, "rewards/belief_accuracy/std": 0.0, "reward": 1.2113056182861328, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 13.0, "kl": 0.48092682659626007, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16833333333333333, "step": 202 }, { "loss": 0.0117, "grad_norm": 21.64349937438965, "learning_rate": 2.7500000000000004e-05, "num_tokens": 483083.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4523000121116638, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.29648277163505554, "rewards/belief_accuracy/std": 0.0, "reward": 1.6178982257843018, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 32.0, "kl": 0.2917898967862129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16916666666666666, "step": 203 }, { "loss": 0.0179, "grad_norm": 0.006457047536969185, "learning_rate": 2.7361111111111114e-05, "num_tokens": 485523.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03180000185966492, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.39455646276474, "rewards/belief_accuracy/std": 0.0, "reward": 1.2813693284988403, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4465770423412323, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17, "step": 204 }, { "loss": 0.0361, "grad_norm": 2.0349791049957275, "learning_rate": 2.7222222222222223e-05, "num_tokens": 487829.0, "completions/mean_length": 11.5, "completions/min_length": 9.0, "completions/max_length": 17.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 17.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5771750211715698, "rewards/env_reward/std": 0.09858879446983337, "rewards/belief_accuracy/mean": 0.3476565480232239, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.9587321281433105, "reward_std": 0.12462284415960312, "frac_reward_zero_std": 0.0, "completion_length": 17.0, "kl": 0.9023318737745285, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17083333333333334, "step": 205 }, { "loss": 0.0244, "grad_norm": 17.28134536743164, "learning_rate": 2.7083333333333332e-05, "num_tokens": 490279.0, "completions/mean_length": 12.5, "completions/min_length": 10.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.046300001442432404, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44797658920288086, "rewards/belief_accuracy/std": 0.002631261944770813, "reward": 1.4633797407150269, "reward_std": 0.007893800735473633, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.6101161614060402, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17166666666666666, "step": 206 }, { "loss": 0.0219, "grad_norm": 0.001407866133376956, "learning_rate": 2.6944444444444445e-05, "num_tokens": 492719.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.31299999356269836, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.269497811794281, "rewards/belief_accuracy/std": 0.0, "reward": 1.327993392944336, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5468098521232605, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1725, "step": 207 }, { "loss": 0.0237, "grad_norm": 5.544673919677734, "learning_rate": 2.6805555555555557e-05, "num_tokens": 495028.0, "completions/mean_length": 12.25, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6684750318527222, "rewards/env_reward/std": 0.24135003983974457, "rewards/belief_accuracy/mean": 0.29648277163505554, "rewards/belief_accuracy/std": 0.0, "reward": 1.942160725593567, "reward_std": 0.36202511191368103, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 0.5928681045770645, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17333333333333334, "step": 208 }, { "loss": 0.0213, "grad_norm": 0.25935015082359314, "learning_rate": 2.6666666666666667e-05, "num_tokens": 497488.0, "completions/mean_length": 15.0, "completions/min_length": 10.0, "completions/max_length": 26.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 26.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.40459999442100525, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4617558717727661, "rewards/belief_accuracy/std": 0.0, "reward": 2.0421676635742188, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 26.0, "kl": 0.5315433144569397, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17416666666666666, "step": 209 }, { "loss": 0.0198, "grad_norm": 4.167694091796875, "learning_rate": 2.652777777777778e-05, "num_tokens": 499928.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6302000284194946, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2970854640007019, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.8865565061569214, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.49413691461086273, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.175, "step": 210 }, { "loss": 0.0172, "grad_norm": 0.009997401386499405, "learning_rate": 2.6388888888888892e-05, "num_tokens": 502368.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7775999903678894, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3181929886341095, "rewards/belief_accuracy/std": 0.0, "reward": 2.1709790229797363, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4289768636226654, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17583333333333334, "step": 211 }, { "loss": 0.0178, "grad_norm": 4.784295558929443, "learning_rate": 2.625e-05, "num_tokens": 504808.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.34290000796318054, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3476565480232239, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.6073195934295654, "reward_std": 0.05555550381541252, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4447771683335304, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17666666666666667, "step": 212 }, { "loss": 0.0174, "grad_norm": 0.017436975613236427, "learning_rate": 2.6111111111111114e-05, "num_tokens": 507248.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5629000067710876, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2377697229385376, "rewards/belief_accuracy/std": 0.0, "reward": 1.607659101486206, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4346160590648651, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1775, "step": 213 }, { "loss": 0.0248, "grad_norm": 12.183658599853516, "learning_rate": 2.5972222222222226e-05, "num_tokens": 509202.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.5482999682426453, "rewards/env_reward/std": 1.6436470746994019, "rewards/belief_accuracy/mean": 0.18266406655311584, "rewards/belief_accuracy/std": 0.25510939955711365, "reward": -0.27445781230926514, "reward_std": 3.327219009399414, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.6210581138730049, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17833333333333334, "step": 214 }, { "loss": 0.0216, "grad_norm": 0.0025398649740964174, "learning_rate": 2.5833333333333336e-05, "num_tokens": 511642.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.035100001841783524, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3102187514305115, "rewards/belief_accuracy/std": 0.0, "reward": 0.9280062317848206, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5393896698951721, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17916666666666667, "step": 215 }, { "loss": 0.0204, "grad_norm": 3.7643086910247803, "learning_rate": 2.5694444444444445e-05, "num_tokens": 514082.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.07569999992847443, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.34864068031311035, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.2094720602035522, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5103324726223946, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18, "step": 216 }, { "loss": 0.0357, "grad_norm": 4.647409439086914, "learning_rate": 2.5555555555555554e-05, "num_tokens": 515817.0, "completions/mean_length": 11.75, "completions/min_length": 10.0, "completions/max_length": 15.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 15.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4375, "rewards/env_reward/std": 0.22679999470710754, "rewards/belief_accuracy/mean": 0.41240280866622925, "rewards/belief_accuracy/std": 0.046603914350271225, "reward": 1.9434584379196167, "reward_std": 0.26885563135147095, "frac_reward_zero_std": 0.0, "completion_length": 15.0, "kl": 0.8914579898118973, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18083333333333335, "step": 217 }, { "loss": 0.023, "grad_norm": 4.6908979415893555, "learning_rate": 2.5416666666666667e-05, "num_tokens": 518257.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.028999999165534973, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3233530819416046, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.0635591745376587, "reward_std": 0.055555541068315506, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5759952142834663, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18166666666666667, "step": 218 }, { "loss": 0.0159, "grad_norm": 5.2558979988098145, "learning_rate": 2.527777777777778e-05, "num_tokens": 520698.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.032999999821186066, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44556546211242676, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.4361964464187622, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.39761848002672195, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1825, "step": 219 }, { "loss": 0.0136, "grad_norm": 13.562455177307129, "learning_rate": 2.513888888888889e-05, "num_tokens": 523160.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.021700000390410423, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.366778701543808, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.1828861236572266, "reward_std": 0.05555546283721924, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.3410550318658352, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18333333333333332, "step": 220 }, { "loss": 0.0449, "grad_norm": 31.85221290588379, "learning_rate": 2.5e-05, "num_tokens": 525606.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.18029999732971191, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.43429505825042725, "rewards/belief_accuracy/std": 0.026091037318110466, "reward": 1.623335361480713, "reward_std": 0.07827307283878326, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 1.1215265393257141, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18416666666666667, "step": 221 }, { "loss": 0.0261, "grad_norm": 0.04103556647896767, "learning_rate": 2.4861111111111114e-05, "num_tokens": 528046.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.46320000290870667, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3248632550239563, "rewards/belief_accuracy/std": 0.0, "reward": 1.719389796257019, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6520499587059021, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.185, "step": 222 }, { "loss": 0.0206, "grad_norm": 25.01701545715332, "learning_rate": 2.4722222222222223e-05, "num_tokens": 530512.0, "completions/mean_length": 16.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 11.333333969116211, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04670000076293945, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3286855220794678, "rewards/belief_accuracy/std": 0.03207503259181976, "reward": 1.1061065196990967, "reward_std": 0.09622509032487869, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.5154426582157612, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18583333333333332, "step": 223 }, { "loss": 0.0254, "grad_norm": 21.70404052734375, "learning_rate": 2.4583333333333332e-05, "num_tokens": 532970.0, "completions/mean_length": 14.5, "completions/min_length": 10.0, "completions/max_length": 18.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 18.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2500999867916107, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44232407212257385, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.7521222829818726, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 18.0, "kl": 0.6355895921587944, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18666666666666668, "step": 224 }, { "loss": 0.0204, "grad_norm": 3.6001179218292236, "learning_rate": 2.4444444444444445e-05, "num_tokens": 535410.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.27079999446868896, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37087196111679077, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.56881582736969, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.509816125035286, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1875, "step": 225 }, { "loss": 0.0297, "grad_norm": 4.272892951965332, "learning_rate": 2.4305555555555558e-05, "num_tokens": 537862.0, "completions/mean_length": 13.0, "completions/min_length": 9.0, "completions/max_length": 17.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.0, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 17.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.34724998474121094, "rewards/env_reward/std": 0.20989999175071716, "rewards/belief_accuracy/mean": 0.3669257164001465, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.6716521978378296, "reward_std": 0.282707542181015, "frac_reward_zero_std": 0.0, "completion_length": 17.0, "kl": 0.7432731539011002, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18833333333333332, "step": 226 }, { "loss": 0.0186, "grad_norm": 0.01726493611931801, "learning_rate": 2.4166666666666667e-05, "num_tokens": 540302.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.10869999974966049, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.32821834087371826, "rewards/belief_accuracy/std": 0.0, "reward": 0.8716050386428833, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.46494513750076294, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18916666666666668, "step": 227 }, { "loss": 0.038, "grad_norm": 0.8162057399749756, "learning_rate": 2.402777777777778e-05, "num_tokens": 542757.0, "completions/mean_length": 13.75, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6843000054359436, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3575194478034973, "rewards/belief_accuracy/std": 0.0, "reward": 2.149008274078369, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 19.0, "kl": 0.9498792588710785, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19, "step": 228 }, { "loss": 0.0196, "grad_norm": 0.008917681872844696, "learning_rate": 2.3888888888888892e-05, "num_tokens": 545197.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.08479999750852585, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3493489623069763, "rewards/belief_accuracy/std": 0.0, "reward": 1.225246787071228, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.48936086893081665, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19083333333333333, "step": 229 }, { "loss": 0.0251, "grad_norm": 10.427980422973633, "learning_rate": 2.375e-05, "num_tokens": 547644.0, "completions/mean_length": 11.75, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.45249998569488525, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3572555184364319, "rewards/belief_accuracy/std": 0.008584737777709961, "reward": 1.8005164861679077, "reward_std": 0.025754213333129883, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 0.6280895695090294, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19166666666666668, "step": 230 }, { "loss": 0.0262, "grad_norm": 0.009933769702911377, "learning_rate": 2.361111111111111e-05, "num_tokens": 550084.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4244000017642975, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.23698042333126068, "rewards/belief_accuracy/std": 0.0, "reward": 1.3975412845611572, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6541983485221863, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1925, "step": 231 }, { "loss": 0.0464, "grad_norm": 7.480578899383545, "learning_rate": 2.3472222222222223e-05, "num_tokens": 552528.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04230000078678131, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3575194478034973, "rewards/belief_accuracy/std": 0.0, "reward": 1.1860084533691406, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 14.0, "kl": 1.1603283882141113, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19333333333333333, "step": 232 }, { "loss": 0.0348, "grad_norm": 3.052133798599243, "learning_rate": 2.3333333333333336e-05, "num_tokens": 554260.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.4188750088214874, "rewards/env_reward/std": 1.720750093460083, "rewards/belief_accuracy/mean": 0.21635779738426208, "rewards/belief_accuracy/std": 0.28025466203689575, "reward": 0.02076089382171631, "reward_std": 3.515756607055664, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.8705815225839615, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19416666666666665, "step": 233 }, { "loss": 0.0257, "grad_norm": 0.004772593267261982, "learning_rate": 2.3194444444444445e-05, "num_tokens": 556700.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6687999963760376, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3552300035953522, "rewards/belief_accuracy/std": 0.0, "reward": 2.1188900470733643, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6416133642196655, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.195, "step": 234 }, { "loss": 0.0258, "grad_norm": 0.5717759132385254, "learning_rate": 2.3055555555555558e-05, "num_tokens": 559153.0, "completions/mean_length": 13.25, "completions/min_length": 10.0, "completions/max_length": 23.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 23.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6043999791145325, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.45852428674697876, "rewards/belief_accuracy/std": 0.0, "reward": 2.3321728706359863, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 23.0, "kl": 0.6454999148845673, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19583333333333333, "step": 235 }, { "loss": 0.0205, "grad_norm": 0.0032116181682795286, "learning_rate": 2.2916666666666667e-05, "num_tokens": 561593.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.05490000173449516, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3623640537261963, "rewards/belief_accuracy/std": 0.0, "reward": 1.2194421291351318, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5121519565582275, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19666666666666666, "step": 236 }, { "loss": 0.0325, "grad_norm": 7.710559844970703, "learning_rate": 2.277777777777778e-05, "num_tokens": 563535.0, "completions/mean_length": 12.5, "completions/min_length": 10.0, "completions/max_length": 20.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 20.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.151924967765808, "rewards/env_reward/std": 0.281749963760376, "rewards/belief_accuracy/mean": 0.24253758788108826, "rewards/belief_accuracy/std": 0.03546026349067688, "reward": 2.505500316619873, "reward_std": 0.45341095328330994, "frac_reward_zero_std": 0.0, "completion_length": 20.0, "kl": 0.8115118741989136, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1975, "step": 237 }, { "loss": 0.0256, "grad_norm": 0.0054448568262159824, "learning_rate": 2.263888888888889e-05, "num_tokens": 565975.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1363999992609024, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3922864496707916, "rewards/belief_accuracy/std": 0.0, "reward": 1.4314594268798828, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6401026844978333, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19833333333333333, "step": 238 }, { "loss": 0.0244, "grad_norm": 0.48050761222839355, "learning_rate": 2.25e-05, "num_tokens": 568418.0, "completions/mean_length": 10.75, "completions/min_length": 10.0, "completions/max_length": 13.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 13.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5138999819755554, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2559005320072174, "rewards/belief_accuracy/std": 0.0, "reward": 1.5885515213012695, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 13.0, "kl": 0.6103616803884506, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19916666666666666, "step": 239 }, { "loss": 0.0212, "grad_norm": 0.010801396332681179, "learning_rate": 2.2361111111111114e-05, "num_tokens": 570858.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.22370000183582306, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44220685958862305, "rewards/belief_accuracy/std": 0.0, "reward": 1.7121706008911133, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5299305319786072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2, "step": 240 }, { "loss": 0.0259, "grad_norm": 0.005526633467525244, "learning_rate": 2.2222222222222223e-05, "num_tokens": 573298.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5411999821662903, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.25235405564308167, "rewards/belief_accuracy/std": 0.0, "reward": 1.6188621520996094, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.647596538066864, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20083333333333334, "step": 241 }, { "loss": 0.0196, "grad_norm": 3.6879804134368896, "learning_rate": 2.2083333333333333e-05, "num_tokens": 575744.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2443999946117401, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3609590232372284, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.4994770288467407, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 0.4897921085357666, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20166666666666666, "step": 242 }, { "loss": 0.0205, "grad_norm": 0.002229674719274044, "learning_rate": 2.1944444444444445e-05, "num_tokens": 578184.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.15629999339580536, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44499048590660095, "rewards/belief_accuracy/std": 0.0, "reward": 1.6194214820861816, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5125965476036072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2025, "step": 243 }, { "loss": 0.0195, "grad_norm": 0.019604183733463287, "learning_rate": 2.1805555555555558e-05, "num_tokens": 580624.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3903000056743622, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2332783341407776, "rewards/belief_accuracy/std": 0.0, "reward": 1.335284948348999, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.488160103559494, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20333333333333334, "step": 244 }, { "loss": 0.0213, "grad_norm": 12.400581359863281, "learning_rate": 2.1666666666666667e-05, "num_tokens": 583070.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3977000117301941, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37080714106559753, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.7589713335037231, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 0.5316231548786163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20416666666666666, "step": 245 }, { "loss": 0.0199, "grad_norm": 0.09725174307823181, "learning_rate": 2.152777777777778e-05, "num_tokens": 585374.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.9031000137329102, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35569527745246887, "rewards/belief_accuracy/std": 0.0, "reward": 2.471735954284668, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.49825286865234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.205, "step": 246 }, { "loss": 0.0266, "grad_norm": 0.0035387862008064985, "learning_rate": 2.138888888888889e-05, "num_tokens": 587814.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.43540000915527344, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2332783341407776, "rewards/belief_accuracy/std": 0.0, "reward": 1.4029350280761719, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6661872863769531, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20583333333333334, "step": 247 }, { "loss": 0.021, "grad_norm": 0.028803575783967972, "learning_rate": 2.125e-05, "num_tokens": 590254.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2700999975204468, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37139055132865906, "rewards/belief_accuracy/std": 0.0, "reward": 1.569321632385254, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5254324078559875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20666666666666667, "step": 248 }, { "loss": 0.0198, "grad_norm": 0.0019563257228583097, "learning_rate": 2.111111111111111e-05, "num_tokens": 592694.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1324000060558319, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3335198163986206, "rewards/belief_accuracy/std": 0.0, "reward": 1.2491594552993774, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4938497245311737, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2075, "step": 249 }, { "loss": 0.0242, "grad_norm": 0.028957560658454895, "learning_rate": 2.0972222222222223e-05, "num_tokens": 595134.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.17980000376701355, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37641847133636475, "rewards/belief_accuracy/std": 0.0, "reward": 1.4489554166793823, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6057292819023132, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20833333333333334, "step": 250 }, { "loss": 0.0236, "grad_norm": 0.007945528253912926, "learning_rate": 2.0833333333333336e-05, "num_tokens": 597574.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16300000250339508, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44220685958862305, "rewards/belief_accuracy/std": 0.0, "reward": 1.621120572090149, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5903597474098206, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20916666666666667, "step": 251 }, { "loss": 0.0215, "grad_norm": 0.0038991058245301247, "learning_rate": 2.0694444444444445e-05, "num_tokens": 600014.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.10509999841451645, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2731817066669464, "rewards/belief_accuracy/std": 0.0, "reward": 1.0271950960159302, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5373863577842712, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21, "step": 252 }, { "loss": 0.0194, "grad_norm": 6.581563949584961, "learning_rate": 2.0555555555555555e-05, "num_tokens": 602456.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.056299999356269836, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3147966265678406, "rewards/belief_accuracy/std": 0.027777792885899544, "reward": 1.0788397789001465, "reward_std": 0.08333337306976318, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.48605240881443024, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21083333333333334, "step": 253 }, { "loss": 0.0186, "grad_norm": 5.765892505645752, "learning_rate": 2.0416666666666667e-05, "num_tokens": 604756.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4338250160217285, "rewards/env_reward/std": 0.043549999594688416, "rewards/belief_accuracy/mean": 0.20506228506565094, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.3159242868423462, "reward_std": 0.05952895060181618, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4648255370557308, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21166666666666667, "step": 254 }, { "loss": 0.1312, "grad_norm": 13.353157043457031, "learning_rate": 2.027777777777778e-05, "num_tokens": 606484.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6255000233650208, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.25628823041915894, "rewards/belief_accuracy/std": 0.06415002793073654, "reward": 1.7571148872375488, "reward_std": 0.19245007634162903, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 3.2804705798625946, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2125, "step": 255 }, { "loss": 0.0262, "grad_norm": 0.006790816783905029, "learning_rate": 2.013888888888889e-05, "num_tokens": 608924.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5291000008583069, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3939528167247772, "rewards/belief_accuracy/std": 0.0, "reward": 2.025508403778076, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6560365557670593, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21333333333333335, "step": 256 }, { "loss": 0.0297, "grad_norm": 0.008915513753890991, "learning_rate": 2e-05, "num_tokens": 611364.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3928000032901764, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3922864496707916, "rewards/belief_accuracy/std": 0.0, "reward": 1.8160593509674072, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.7423607707023621, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21416666666666667, "step": 257 }, { "loss": 0.0232, "grad_norm": 0.008568666875362396, "learning_rate": 1.986111111111111e-05, "num_tokens": 613804.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4122999906539917, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2332783341407776, "rewards/belief_accuracy/std": 0.0, "reward": 1.3682849407196045, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5796423554420471, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.215, "step": 258 }, { "loss": 0.0253, "grad_norm": 0.015973366796970367, "learning_rate": 1.9722222222222224e-05, "num_tokens": 616244.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5038999915122986, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.396670401096344, "rewards/belief_accuracy/std": 0.0, "reward": 1.995861291885376, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6325451135635376, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21583333333333332, "step": 259 }, { "loss": 0.0625, "grad_norm": 10.546822547912598, "learning_rate": 1.9583333333333333e-05, "num_tokens": 618696.0, "completions/mean_length": 13.0, "completions/min_length": 10.0, "completions/max_length": 18.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 18.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3395000100135803, "rewards/env_reward/std": 0.1738000065088272, "rewards/belief_accuracy/mean": 0.3621312975883484, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.645643949508667, "reward_std": 0.2840888798236847, "frac_reward_zero_std": 0.0, "completion_length": 18.0, "kl": 1.5618795603513718, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21666666666666667, "step": 260 }, { "loss": 0.0208, "grad_norm": 5.100569248199463, "learning_rate": 1.9444444444444445e-05, "num_tokens": 621136.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.8407999873161316, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.33726853132247925, "rewards/belief_accuracy/std": 0.02138333022594452, "reward": 2.323005437850952, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5200706869363785, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2175, "step": 261 }, { "loss": 0.0731, "grad_norm": 1.2731825113296509, "learning_rate": 1.9305555555555558e-05, "num_tokens": 623576.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5206000208854675, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37815189361572266, "rewards/belief_accuracy/std": 0.03703702986240387, "reward": 1.965355634689331, "reward_std": 0.11111104488372803, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.8271799832582474, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21833333333333332, "step": 262 }, { "loss": 0.0314, "grad_norm": 2.086411952972412, "learning_rate": 1.9166666666666667e-05, "num_tokens": 625897.0, "completions/mean_length": 15.25, "completions/min_length": 9.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 9.666666984558105, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.17422500252723694, "rewards/env_reward/std": 0.36454999446868896, "rewards/belief_accuracy/mean": 0.24253758788108826, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.0389503240585327, "reward_std": 0.5677647590637207, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.7842119336128235, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21916666666666668, "step": 263 }, { "loss": 0.0357, "grad_norm": 24.3664608001709, "learning_rate": 1.9027777777777776e-05, "num_tokens": 628354.0, "completions/mean_length": 14.25, "completions/min_length": 10.0, "completions/max_length": 27.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 27.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2946000099182129, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3737679123878479, "rewards/belief_accuracy/std": 0.03703702986240387, "reward": 1.613203763961792, "reward_std": 0.11111104488372803, "frac_reward_zero_std": 0.0, "completion_length": 27.0, "kl": 0.8927547335624695, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22, "step": 264 }, { "loss": 0.0278, "grad_norm": 6.769376754760742, "learning_rate": 1.888888888888889e-05, "num_tokens": 630816.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7105250358581543, "rewards/env_reward/std": 0.0034499764442443848, "rewards/belief_accuracy/mean": 0.3575194478034973, "rewards/belief_accuracy/std": 0.0, "reward": 2.1883459091186523, "reward_std": 0.005174954887479544, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.6946739554405212, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22083333333333333, "step": 265 }, { "loss": 0.0181, "grad_norm": 4.612954616546631, "learning_rate": 1.8750000000000002e-05, "num_tokens": 633256.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.05389999970793724, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3394756317138672, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.149276852607727, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.45268136262893677, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22166666666666668, "step": 266 }, { "loss": 0.0118, "grad_norm": 3.7529609203338623, "learning_rate": 1.861111111111111e-05, "num_tokens": 635696.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20900000631809235, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28013184666633606, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.2038955688476562, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2946026027202606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2225, "step": 267 }, { "loss": 0.0163, "grad_norm": 5.185886383056641, "learning_rate": 1.8472222222222224e-05, "num_tokens": 638136.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.30160000920295715, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.29433107376098633, "rewards/belief_accuracy/std": 0.014091731980443, "reward": 1.3853931427001953, "reward_std": 0.04227517917752266, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4079572707414627, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22333333333333333, "step": 268 }, { "loss": 0.0159, "grad_norm": 4.883063793182373, "learning_rate": 1.8333333333333333e-05, "num_tokens": 640599.0, "completions/mean_length": 15.75, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.333333969116211, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5027999877929688, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.39204078912734985, "rewards/belief_accuracy/std": 0.04375755414366722, "reward": 1.9803223609924316, "reward_std": 0.13127277791500092, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.3966846764087677, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22416666666666665, "step": 269 }, { "loss": 0.0213, "grad_norm": 3.5628440380096436, "learning_rate": 1.8194444444444445e-05, "num_tokens": 643039.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.16940000653266907, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4062002897262573, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.0145008563995361, "reward_std": 0.06414999067783356, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5333509296178818, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.225, "step": 270 }, { "loss": 0.014, "grad_norm": 3.8872599601745605, "learning_rate": 1.8055555555555555e-05, "num_tokens": 645479.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03889999911189079, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3297416567802429, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.097575068473816, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3496560603380203, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22583333333333333, "step": 271 }, { "loss": 0.0211, "grad_norm": 4.416809558868408, "learning_rate": 1.7916666666666667e-05, "num_tokens": 647919.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.49219998717308044, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.31531640887260437, "rewards/belief_accuracy/std": 0.10029676556587219, "reward": 1.7342491149902344, "reward_std": 0.30089032649993896, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5263015776872635, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22666666666666666, "step": 272 }, { "loss": 0.0292, "grad_norm": 30.599613189697266, "learning_rate": 1.777777777777778e-05, "num_tokens": 650373.0, "completions/mean_length": 13.5, "completions/min_length": 10.0, "completions/max_length": 24.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 24.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2985999882221222, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2865440845489502, "rewards/belief_accuracy/std": 0.026354150846600533, "reward": 1.357532262802124, "reward_std": 0.07906243950128555, "frac_reward_zero_std": 0.0, "completion_length": 24.0, "kl": 0.7304297238588333, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2275, "step": 273 }, { "loss": 0.0342, "grad_norm": 0.20082256197929382, "learning_rate": 1.763888888888889e-05, "num_tokens": 652835.0, "completions/mean_length": 15.5, "completions/min_length": 14.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.5, "completions/min_terminated_length": 14.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.1923000067472458, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3652553856372833, "rewards/belief_accuracy/std": 0.0, "reward": 0.8573161959648132, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 16.0, "kl": 0.854352742433548, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22833333333333333, "step": 274 }, { "loss": 0.0958, "grad_norm": 7.556239128112793, "learning_rate": 1.75e-05, "num_tokens": 654947.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.4742000102996826, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.338287353515625, "rewards/belief_accuracy/std": 0.07964453846216202, "reward": 3.2761619091033936, "reward_std": 0.23893354833126068, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 2.3944233655929565, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22916666666666666, "step": 275 }, { "loss": 0.0208, "grad_norm": 0.03813393786549568, "learning_rate": 1.736111111111111e-05, "num_tokens": 657387.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.08429999649524689, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2742648124694824, "rewards/belief_accuracy/std": 0.0, "reward": 0.9992444515228271, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5205760598182678, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23, "step": 276 }, { "loss": 0.0133, "grad_norm": 0.15289942920207977, "learning_rate": 1.7222222222222224e-05, "num_tokens": 659827.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4259999990463257, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3825729489326477, "rewards/belief_accuracy/std": 0.0, "reward": 1.8367189168930054, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.33351561427116394, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23083333333333333, "step": 277 }, { "loss": 0.023, "grad_norm": 8.329577445983887, "learning_rate": 1.7083333333333333e-05, "num_tokens": 661781.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.250900000333786, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3728598952293396, "rewards/belief_accuracy/std": 0.061039846390485764, "reward": 1.5449297428131104, "reward_std": 0.18311955034732819, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.5746472030878067, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23166666666666666, "step": 278 }, { "loss": 0.0241, "grad_norm": 0.014571048319339752, "learning_rate": 1.6944444444444446e-05, "num_tokens": 664221.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20360000431537628, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.338581919670105, "rewards/belief_accuracy/std": 0.0, "reward": 1.3711457252502441, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6032420992851257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2325, "step": 279 }, { "loss": 0.0174, "grad_norm": 4.0654096603393555, "learning_rate": 1.6805555555555558e-05, "num_tokens": 666661.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.0341999530792236, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3465277850627899, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 2.640883445739746, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.43393784016370773, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23333333333333334, "step": 280 }, { "loss": 0.0239, "grad_norm": 4.75065279006958, "learning_rate": 1.6666666666666667e-05, "num_tokens": 669102.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.046300001442432404, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2916484773159027, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 0.99439537525177, "reward_std": 0.055555522441864014, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.5964707285165787, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23416666666666666, "step": 281 }, { "loss": 0.0213, "grad_norm": 4.148177623748779, "learning_rate": 1.6527777777777777e-05, "num_tokens": 670830.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.4350999593734741, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.19624130427837372, "rewards/belief_accuracy/std": 0.08000914752483368, "reward": 2.7913737297058105, "reward_std": 0.24002742767333984, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5318765938282013, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.235, "step": 282 }, { "loss": 0.0228, "grad_norm": 3.6992576122283936, "learning_rate": 1.638888888888889e-05, "num_tokens": 673270.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.15850000083446503, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2708725929260254, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.100367784500122, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5698364973068237, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23583333333333334, "step": 283 }, { "loss": 0.0166, "grad_norm": 3.40875244140625, "learning_rate": 1.6250000000000002e-05, "num_tokens": 675598.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.35032498836517334, "rewards/env_reward/std": 0.05494999885559082, "rewards/belief_accuracy/mean": 0.32056111097335815, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.5371708869934082, "reward_std": 0.02686941623687744, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.41486937925219536, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23666666666666666, "step": 284 }, { "loss": 0.0182, "grad_norm": 2.8202176094055176, "learning_rate": 1.6111111111111115e-05, "num_tokens": 678038.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.0195000171661377, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.33726853132247925, "rewards/belief_accuracy/std": 0.02138333022594452, "reward": 2.5910556316375732, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.45439669489860535, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2375, "step": 285 }, { "loss": 0.0117, "grad_norm": 0.09877491742372513, "learning_rate": 1.597222222222222e-05, "num_tokens": 680478.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.47510001063346863, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.461570680141449, "rewards/belief_accuracy/std": 0.0, "reward": 2.147361993789673, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.2931655943393707, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23833333333333334, "step": 286 }, { "loss": 0.018, "grad_norm": 4.7789435386657715, "learning_rate": 1.5833333333333333e-05, "num_tokens": 682918.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1363999992609024, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4030686020851135, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.4638057947158813, "reward_std": 0.055555541068315506, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4500267207622528, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23916666666666667, "step": 287 }, { "loss": 0.011, "grad_norm": 2.197296380996704, "learning_rate": 1.5694444444444446e-05, "num_tokens": 685358.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7249000072479248, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2554989457130432, "rewards/belief_accuracy/std": 0.021383339539170265, "reward": 1.9038467407226562, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2741189934313297, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24, "step": 288 }, { "loss": 0.0504, "grad_norm": 13.399361610412598, "learning_rate": 1.5555555555555555e-05, "num_tokens": 687804.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.11860000342130661, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2638707160949707, "rewards/belief_accuracy/std": 0.0, "reward": 1.0195121765136719, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 16.0, "kl": 1.2592113390564919, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24083333333333334, "step": 289 }, { "loss": 0.0181, "grad_norm": 4.401183605194092, "learning_rate": 1.5416666666666668e-05, "num_tokens": 690250.0, "completions/mean_length": 11.5, "completions/min_length": 10.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5791000127792358, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.30634474754333496, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.837684154510498, "reward_std": 0.06414999067783356, "frac_reward_zero_std": 0.0, "completion_length": 16.0, "kl": 0.4531232714653015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24166666666666667, "step": 290 }, { "loss": 0.0429, "grad_norm": 3.968140125274658, "learning_rate": 1.527777777777778e-05, "num_tokens": 692720.0, "completions/mean_length": 17.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 12.666666984558105, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.1729000061750412, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3014543652534485, "rewards/belief_accuracy/std": 0.03703703731298447, "reward": 0.6950130462646484, "reward_std": 0.1111110970377922, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 1.0730007663369179, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2425, "step": 291 }, { "loss": 0.0174, "grad_norm": 3.9793334007263184, "learning_rate": 1.5138888888888888e-05, "num_tokens": 695164.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.27799999713897705, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3198787569999695, "rewards/belief_accuracy/std": 0.10029677301645279, "reward": 1.4266362190246582, "reward_std": 0.30089032649993896, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.43434225767850876, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24333333333333335, "step": 292 }, { "loss": 0.0143, "grad_norm": 32.97469711303711, "learning_rate": 1.5e-05, "num_tokens": 697626.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.059700001031160355, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2282104194164276, "rewards/belief_accuracy/std": 0.03817690536379814, "reward": 0.824181318283081, "reward_std": 0.11453071981668472, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.35747910663485527, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24416666666666667, "step": 293 }, { "loss": 0.0346, "grad_norm": 2.7478795051574707, "learning_rate": 1.4861111111111111e-05, "num_tokens": 700066.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3815999925136566, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3802044987678528, "rewards/belief_accuracy/std": 0.08288891613483429, "reward": 1.763013482093811, "reward_std": 0.24866671860218048, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.8649156019091606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.245, "step": 294 }, { "loss": 0.0165, "grad_norm": 5.457376956939697, "learning_rate": 1.4722222222222224e-05, "num_tokens": 702550.0, "completions/mean_length": 21.0, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.5, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.20010000467300415, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.39694103598594666, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 0.9406732320785522, "reward_std": 0.055555522441864014, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.4137156940996647, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24583333333333332, "step": 295 }, { "loss": 0.0173, "grad_norm": 3.0129573345184326, "learning_rate": 1.4583333333333335e-05, "num_tokens": 704990.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.35929998755455017, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.41454631090164185, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.8325889110565186, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.43357478082180023, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24666666666666667, "step": 296 }, { "loss": 0.0168, "grad_norm": 13.825785636901855, "learning_rate": 1.4444444444444444e-05, "num_tokens": 707466.0, "completions/mean_length": 19.0, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 14.666666984558105, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 24.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3904249966144562, "rewards/env_reward/std": 0.12855000793933868, "rewards/belief_accuracy/mean": 0.3586101531982422, "rewards/belief_accuracy/std": 0.011781362816691399, "reward": 1.711467981338501, "reward_std": 0.2151748389005661, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.4190758764743805, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2475, "step": 297 }, { "loss": 0.0221, "grad_norm": 2.5629937648773193, "learning_rate": 1.4305555555555555e-05, "num_tokens": 709906.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1729000061750412, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.34271708130836487, "rewards/belief_accuracy/std": 0.008270323276519775, "reward": 1.3375012874603271, "reward_std": 0.024810949340462685, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5514194816350937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24833333333333332, "step": 298 }, { "loss": 0.016, "grad_norm": 8.018586158752441, "learning_rate": 1.4166666666666668e-05, "num_tokens": 712368.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4747999906539917, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.436899334192276, "rewards/belief_accuracy/std": 0.026089724153280258, "reward": 2.0728981494903564, "reward_std": 0.07826922833919525, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.4000145494937897, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24916666666666668, "step": 299 }, { "loss": 0.0159, "grad_norm": 2.438758373260498, "learning_rate": 1.4027777777777779e-05, "num_tokens": 714808.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.036400001496076584, "rewards/env_reward/std": 0.01100000087171793, "rewards/belief_accuracy/mean": 0.31399524211883545, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.0465856790542603, "reward_std": 0.06300617754459381, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3966354578733444, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25, "step": 300 }, { "loss": 0.0188, "grad_norm": 2.277829170227051, "learning_rate": 1.388888888888889e-05, "num_tokens": 717248.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.03530000150203705, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.339000940322876, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.1199527978897095, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.46993373334407806, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25083333333333335, "step": 301 }, { "loss": 0.0193, "grad_norm": 0.5459198951721191, "learning_rate": 1.3750000000000002e-05, "num_tokens": 719692.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.25440001487731934, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.36154788732528687, "rewards/belief_accuracy/std": 0.0, "reward": 1.5162436962127686, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 14.0, "kl": 0.4824730008840561, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25166666666666665, "step": 302 }, { "loss": 0.0185, "grad_norm": 2.4194753170013428, "learning_rate": 1.3611111111111111e-05, "num_tokens": 721652.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6984999775886536, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2708725929260254, "rewards/belief_accuracy/std": 0.06415002793073654, "reward": 1.910367727279663, "reward_std": 0.192450150847435, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4619780480861664, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2525, "step": 303 }, { "loss": 0.0184, "grad_norm": 0.019419284537434578, "learning_rate": 1.3472222222222222e-05, "num_tokens": 724092.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.10329999774694443, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.338581919670105, "rewards/belief_accuracy/std": 0.0, "reward": 1.2206957340240479, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.45894408226013184, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25333333333333335, "step": 304 }, { "loss": 0.025, "grad_norm": 2.1239161491394043, "learning_rate": 1.3333333333333333e-05, "num_tokens": 726554.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3213750123977661, "rewards/env_reward/std": 0.04535000026226044, "rewards/belief_accuracy/mean": 0.33021634817123413, "rewards/belief_accuracy/std": 0.03546027094125748, "reward": 1.5227115154266357, "reward_std": 0.11586559563875198, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.6261979192495346, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25416666666666665, "step": 305 }, { "loss": 0.025, "grad_norm": 194.4773406982422, "learning_rate": 1.3194444444444446e-05, "num_tokens": 729009.0, "completions/mean_length": 13.75, "completions/min_length": 10.0, "completions/max_length": 25.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 25.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.054099999368190765, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3904189467430115, "rewards/belief_accuracy/std": 0.011587768793106079, "reward": 1.3024067878723145, "reward_std": 0.034763336181640625, "frac_reward_zero_std": 0.0, "completion_length": 25.0, "kl": 0.6245425343513489, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.255, "step": 306 }, { "loss": 0.0136, "grad_norm": 3.1589040756225586, "learning_rate": 1.3055555555555557e-05, "num_tokens": 731449.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.23800000548362732, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2708725929260254, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.2196178436279297, "reward_std": 0.06414999067783356, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.33902474492788315, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25583333333333336, "step": 307 }, { "loss": 0.0263, "grad_norm": 2.6437861919403076, "learning_rate": 1.2916666666666668e-05, "num_tokens": 733749.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20419999957084656, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3557646870613098, "rewards/belief_accuracy/std": 0.02433657832443714, "reward": 1.4235941171646118, "reward_std": 0.07300972193479538, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6563503816723824, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25666666666666665, "step": 308 }, { "loss": 0.0172, "grad_norm": 0.04555347189307213, "learning_rate": 1.2777777777777777e-05, "num_tokens": 736189.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1460999995470047, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44499048590660095, "rewards/belief_accuracy/std": 0.0, "reward": 1.604121446609497, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.42974987626075745, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2575, "step": 309 }, { "loss": 0.0315, "grad_norm": 0.02474827878177166, "learning_rate": 1.263888888888889e-05, "num_tokens": 738629.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.017999999225139618, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4330648183822632, "rewards/belief_accuracy/std": 0.0, "reward": 1.3761944770812988, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.7871623039245605, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25833333333333336, "step": 310 }, { "loss": 0.0141, "grad_norm": 3.0570595264434814, "learning_rate": 1.25e-05, "num_tokens": 741069.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16259999573230743, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2708725929260254, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.1065177917480469, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3533485382795334, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25916666666666666, "step": 311 }, { "loss": 0.0271, "grad_norm": 139.93460083007812, "learning_rate": 1.2361111111111112e-05, "num_tokens": 743531.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5424000024795532, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2697893977165222, "rewards/belief_accuracy/std": 0.0277777761220932, "reward": 1.6729682683944702, "reward_std": 0.08333329111337662, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.678299218416214, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26, "step": 312 }, { "loss": 0.0165, "grad_norm": 2.2570526599884033, "learning_rate": 1.2222222222222222e-05, "num_tokens": 745971.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7261999845504761, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3459707498550415, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 2.1772122383117676, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4118284657597542, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2608333333333333, "step": 313 }, { "loss": 0.0153, "grad_norm": 3.618257761001587, "learning_rate": 1.2083333333333333e-05, "num_tokens": 748412.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4198000133037567, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.38340792059898376, "rewards/belief_accuracy/std": 0.03252057731151581, "reward": 1.8299237489700317, "reward_std": 0.09756175428628922, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.38145381957292557, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26166666666666666, "step": 314 }, { "loss": 0.0216, "grad_norm": 10.187060356140137, "learning_rate": 1.1944444444444446e-05, "num_tokens": 750546.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.2267999649047852, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.261056125164032, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 2.673368453979492, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.5387748070061207, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2625, "step": 315 }, { "loss": 0.024, "grad_norm": 1.7571438550949097, "learning_rate": 1.1805555555555555e-05, "num_tokens": 752986.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.0868000015616417, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37947753071784973, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.0582326650619507, "reward_std": 0.055555541068315506, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5994225069880486, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2633333333333333, "step": 316 }, { "loss": 0.0241, "grad_norm": 0.014529568143188953, "learning_rate": 1.1666666666666668e-05, "num_tokens": 755426.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1817999929189682, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.37021827697753906, "rewards/belief_accuracy/std": 0.0, "reward": 1.4333548545837402, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.601597785949707, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26416666666666666, "step": 317 }, { "loss": 0.0309, "grad_norm": 2.389315366744995, "learning_rate": 1.1527777777777779e-05, "num_tokens": 757154.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.7894999980926514, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.408660352230072, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 3.960230827331543, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7737997323274612, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.265, "step": 318 }, { "loss": 0.0479, "grad_norm": 3.118506669998169, "learning_rate": 1.138888888888889e-05, "num_tokens": 759594.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.23199999332427979, "rewards/env_reward/std": 0.07039999961853027, "rewards/belief_accuracy/mean": 0.35916271805763245, "rewards/belief_accuracy/std": 0.020031657069921494, "reward": 1.4754880666732788, "reward_std": 0.1484978199005127, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.1970821470022202, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2658333333333333, "step": 319 }, { "loss": 0.0259, "grad_norm": 0.015209314413368702, "learning_rate": 1.125e-05, "num_tokens": 762034.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.11079999804496765, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3922864496707916, "rewards/belief_accuracy/std": 0.0, "reward": 1.3930593729019165, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6464384198188782, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26666666666666666, "step": 320 }, { "loss": 0.044, "grad_norm": 161.41494750976562, "learning_rate": 1.1111111111111112e-05, "num_tokens": 764483.0, "completions/mean_length": 12.25, "completions/min_length": 10.0, "completions/max_length": 19.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 12.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 19.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.45329999923706055, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3586101531982422, "rewards/belief_accuracy/std": 0.011781362816691399, "reward": 1.8057804107666016, "reward_std": 0.03534410521388054, "frac_reward_zero_std": 0.0, "completion_length": 19.0, "kl": 1.1002038344740868, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2675, "step": 321 }, { "loss": 0.0161, "grad_norm": 0.044717781245708466, "learning_rate": 1.0972222222222223e-05, "num_tokens": 766923.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.29429998993873596, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.42471882700920105, "rewards/belief_accuracy/std": 0.0, "reward": 1.76560640335083, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.40166932344436646, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2683333333333333, "step": 322 }, { "loss": 0.0179, "grad_norm": 2.5786283016204834, "learning_rate": 1.0833333333333334e-05, "num_tokens": 769365.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16269999742507935, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.40731751918792725, "rewards/belief_accuracy/std": 0.010020703077316284, "reward": 1.5160024166107178, "reward_std": 0.030062079429626465, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.4470795914530754, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26916666666666667, "step": 323 }, { "loss": 0.0265, "grad_norm": 1.7102688550949097, "learning_rate": 1.0694444444444444e-05, "num_tokens": 771477.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.35530000925064087, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.34043174982070923, "rewards/belief_accuracy/std": 0.011708259582519531, "reward": 1.6042454242706299, "reward_std": 0.035124778747558594, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6622214764356613, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.27, "step": 324 }, { "loss": 0.0144, "grad_norm": 2.304865837097168, "learning_rate": 1.0555555555555555e-05, "num_tokens": 773917.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1941000074148178, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4178144931793213, "rewards/belief_accuracy/std": 0.016261577606201172, "reward": 1.5945935249328613, "reward_std": 0.048784732818603516, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3590489625930786, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2708333333333333, "step": 325 }, { "loss": 0.0175, "grad_norm": 1.4902801513671875, "learning_rate": 1.0416666666666668e-05, "num_tokens": 775862.0, "completions/mean_length": 13.25, "completions/min_length": 10.0, "completions/max_length": 23.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 23.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6675999760627747, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4199703335762024, "rewards/belief_accuracy/std": 0.016849856823682785, "reward": 2.3113110065460205, "reward_std": 0.05054942145943642, "frac_reward_zero_std": 0.0, "completion_length": 23.0, "kl": 0.4370143413543701, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.27166666666666667, "step": 326 }, { "loss": 0.0305, "grad_norm": 2.2359626293182373, "learning_rate": 1.0277777777777777e-05, "num_tokens": 778302.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.02942500077188015, "rewards/env_reward/std": 0.05795000120997429, "rewards/belief_accuracy/mean": 0.24623969197273254, "rewards/belief_accuracy/std": 0.018518514931201935, "reward": 0.8328565359115601, "reward_std": 0.11773625016212463, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7615313455462456, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2725, "step": 327 }, { "loss": 0.0132, "grad_norm": 2.020049810409546, "learning_rate": 1.013888888888889e-05, "num_tokens": 780742.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.8956000208854675, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.27441903948783875, "rewards/belief_accuracy/std": 0.02138333022594452, "reward": 2.2166571617126465, "reward_std": 0.06414992362260818, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3303503468632698, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2733333333333333, "step": 328 }, { "loss": 0.0334, "grad_norm": 2.1073155403137207, "learning_rate": 1e-05, "num_tokens": 783182.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.34060001373291016, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.265547513961792, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.357542634010315, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.8350988179445267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.27416666666666667, "step": 329 }, { "loss": 0.0171, "grad_norm": 2.754615306854248, "learning_rate": 9.861111111111112e-06, "num_tokens": 785622.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.17589999735355377, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3334578275680542, "rewards/belief_accuracy/std": 0.022659126669168472, "reward": 1.3142235279083252, "reward_std": 0.0679774284362793, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.42683811113238335, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.275, "step": 330 }, { "loss": 0.0211, "grad_norm": 2.232966899871826, "learning_rate": 9.722222222222223e-06, "num_tokens": 788062.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.41600000858306885, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2752085030078888, "rewards/belief_accuracy/std": 0.0002678185701370239, "reward": 1.4996254444122314, "reward_std": 0.0008034308557398617, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5274177491664886, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2758333333333333, "step": 331 }, { "loss": 0.0225, "grad_norm": 6.217159271240234, "learning_rate": 9.583333333333334e-06, "num_tokens": 790524.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6958000063896179, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.33671149611473083, "rewards/belief_accuracy/std": 0.02138333022594452, "reward": 2.103834390640259, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.5627310574054718, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.27666666666666667, "step": 332 }, { "loss": 0.0232, "grad_norm": 0.021253760904073715, "learning_rate": 9.444444444444445e-06, "num_tokens": 792964.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6172999739646912, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4087548851966858, "rewards/belief_accuracy/std": 0.0, "reward": 2.2022147178649902, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5807605981826782, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2775, "step": 333 }, { "loss": 0.0179, "grad_norm": 21.313928604125977, "learning_rate": 9.305555555555555e-06, "num_tokens": 795448.0, "completions/mean_length": 21.0, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.5, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6018999814987183, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.27441903948783875, "rewards/belief_accuracy/std": 0.02138333022594452, "reward": 1.7761070728302002, "reward_std": 0.06414999067783356, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.4485930949449539, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2783333333333333, "step": 334 }, { "loss": 0.0172, "grad_norm": 1.7311512231826782, "learning_rate": 9.166666666666666e-06, "num_tokens": 797888.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.13109999895095825, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3684219717979431, "rewards/belief_accuracy/std": 0.01599299907684326, "reward": 1.3519158363342285, "reward_std": 0.047978997230529785, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.43019605800509453, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2791666666666667, "step": 335 }, { "loss": 0.0143, "grad_norm": 2.104975461959839, "learning_rate": 9.027777777777777e-06, "num_tokens": 800328.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.46470001339912415, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3800663948059082, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.887249231338501, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.35861632227897644, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.28, "step": 336 }, { "loss": 0.0185, "grad_norm": 0.03680419921875, "learning_rate": 8.88888888888889e-06, "num_tokens": 802768.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.07029999792575836, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.32095709443092346, "rewards/belief_accuracy/std": 0.0, "reward": 1.1183212995529175, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4628690183162689, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2808333333333333, "step": 337 }, { "loss": 0.0137, "grad_norm": 1.9502252340316772, "learning_rate": 8.75e-06, "num_tokens": 805208.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.9326000213623047, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2651597857475281, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 2.2443795204162598, "reward_std": 0.05555550381541252, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3435991369187832, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2816666666666667, "step": 338 }, { "loss": 0.0323, "grad_norm": 48.597938537597656, "learning_rate": 8.611111111111112e-06, "num_tokens": 807670.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.24860000610351562, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3153432607650757, "rewards/belief_accuracy/std": 0.04375755414366722, "reward": 1.3689297437667847, "reward_std": 0.13127261400222778, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.8074730969965458, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2825, "step": 339 }, { "loss": 0.0137, "grad_norm": 3.296276092529297, "learning_rate": 8.472222222222223e-06, "num_tokens": 810110.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.1817999929189682, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.39799603819847107, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.5166881084442139, "reward_std": 0.05555550381541252, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.34323835372924805, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2833333333333333, "step": 340 }, { "loss": 0.0202, "grad_norm": 0.026873953640460968, "learning_rate": 8.333333333333334e-06, "num_tokens": 812550.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.24779999256134033, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3939528167247772, "rewards/belief_accuracy/std": 0.0, "reward": 1.6035584211349487, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5060014128684998, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2841666666666667, "step": 341 }, { "loss": 0.0191, "grad_norm": 1.4969699382781982, "learning_rate": 8.194444444444445e-06, "num_tokens": 814990.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.05469999834895134, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3531047999858856, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.1913644075393677, "reward_std": 0.055555541068315506, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.47678476572036743, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.285, "step": 342 }, { "loss": 0.0147, "grad_norm": 2.3507494926452637, "learning_rate": 8.055555555555557e-06, "num_tokens": 816918.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7950999736785889, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3493489623069763, "rewards/belief_accuracy/std": 0.030240608379244804, "reward": 2.290696620941162, "reward_std": 0.0907217413187027, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3678537532687187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.28583333333333333, "step": 343 }, { "loss": 0.0145, "grad_norm": 2.588015556335449, "learning_rate": 7.916666666666667e-06, "num_tokens": 819358.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4438000023365021, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4264719784259796, "rewards/belief_accuracy/std": 0.02138333022594452, "reward": 1.9951159954071045, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3631714880466461, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2866666666666667, "step": 344 }, { "loss": 0.0176, "grad_norm": 7.8612518310546875, "learning_rate": 7.777777777777777e-06, "num_tokens": 821819.0, "completions/mean_length": 15.25, "completions/min_length": 10.0, "completions/max_length": 31.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 15.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 31.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.529699981212616, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.32745224237442017, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.826906681060791, "reward_std": 0.05555546283721924, "frac_reward_zero_std": 0.0, "completion_length": 31.0, "kl": 0.43969496712088585, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2875, "step": 345 }, { "loss": 0.0149, "grad_norm": 0.2849005162715912, "learning_rate": 7.63888888888889e-06, "num_tokens": 824264.0, "completions/mean_length": 11.25, "completions/min_length": 10.0, "completions/max_length": 15.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 15.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.05490000173449516, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3253270387649536, "rewards/belief_accuracy/std": 0.0, "reward": 1.1083310842514038, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 15.0, "kl": 0.3736693486571312, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.28833333333333333, "step": 346 }, { "loss": 0.0167, "grad_norm": 2.451597213745117, "learning_rate": 7.5e-06, "num_tokens": 826706.0, "completions/mean_length": 10.5, "completions/min_length": 10.0, "completions/max_length": 12.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 12.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.33180001378059387, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2972326874732971, "rewards/belief_accuracy/std": 0.0274982750415802, "reward": 1.4393980503082275, "reward_std": 0.0824948102235794, "frac_reward_zero_std": 0.0, "completion_length": 12.0, "kl": 0.4166445955634117, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2891666666666667, "step": 347 }, { "loss": 0.0156, "grad_norm": 2.6531128883361816, "learning_rate": 7.361111111111112e-06, "num_tokens": 829146.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2734000086784363, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35287201404571533, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.5187160968780518, "reward_std": 0.06414999067783356, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3893243670463562, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.29, "step": 348 }, { "loss": 0.0146, "grad_norm": 6.793511867523193, "learning_rate": 7.222222222222222e-06, "num_tokens": 831591.0, "completions/mean_length": 11.25, "completions/min_length": 10.0, "completions/max_length": 15.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 15.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.18289999663829803, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3153432607650757, "rewards/belief_accuracy/std": 0.04375755414366722, "reward": 1.270379662513733, "reward_std": 0.13127264380455017, "frac_reward_zero_std": 0.0, "completion_length": 15.0, "kl": 0.36402036249637604, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.29083333333333333, "step": 349 }, { "loss": 0.01, "grad_norm": 14.456671714782715, "learning_rate": 7.083333333333334e-06, "num_tokens": 834036.0, "completions/mean_length": 11.25, "completions/min_length": 10.0, "completions/max_length": 15.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 15.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.0419000387191772, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3418981432914734, "rewards/belief_accuracy/std": 0.0277777761220932, "reward": 2.6385445594787598, "reward_std": 0.08333325386047363, "frac_reward_zero_std": 0.0, "completion_length": 15.0, "kl": 0.25079457089304924, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2916666666666667, "step": 350 }, { "loss": 0.0127, "grad_norm": 2.2002947330474854, "learning_rate": 6.944444444444445e-06, "num_tokens": 836476.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3596999943256378, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.38932567834854126, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.7575269937515259, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.31729260832071304, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2925, "step": 351 }, { "loss": 0.0288, "grad_norm": 3.2136647701263428, "learning_rate": 6.805555555555556e-06, "num_tokens": 838916.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04879999905824661, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.273129940032959, "rewards/belief_accuracy/std": 0.055555544793605804, "reward": 0.9425899386405945, "reward_std": 0.166666641831398, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7196188867092133, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.29333333333333333, "step": 352 }, { "loss": 0.0176, "grad_norm": 2.321444034576416, "learning_rate": 6.666666666666667e-06, "num_tokens": 841360.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4609000086784363, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.39598548412323, "rewards/belief_accuracy/std": 0.026168476790189743, "reward": 1.9293063879013062, "reward_std": 0.07850543409585953, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.44010917842388153, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2941666666666667, "step": 353 }, { "loss": 0.0189, "grad_norm": 0.029545234516263008, "learning_rate": 6.5277777777777784e-06, "num_tokens": 843800.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.48510000109672546, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28782621026039124, "rewards/belief_accuracy/std": 0.0, "reward": 1.641128659248352, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.4735727906227112, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.295, "step": 354 }, { "loss": 0.0103, "grad_norm": 2.6239774227142334, "learning_rate": 6.3888888888888885e-06, "num_tokens": 846240.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.41990000009536743, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4032561779022217, "rewards/belief_accuracy/std": 0.013788819313049316, "reward": 1.8896186351776123, "reward_std": 0.04136645793914795, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.25653597339987755, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.29583333333333334, "step": 355 }, { "loss": 0.0155, "grad_norm": 9.297626495361328, "learning_rate": 6.25e-06, "num_tokens": 848566.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.05260000005364418, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.38377392292022705, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.280221700668335, "reward_std": 0.06414999067783356, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.3872426562011242, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2966666666666667, "step": 356 }, { "loss": 0.0161, "grad_norm": 1.9092109203338623, "learning_rate": 6.111111111111111e-06, "num_tokens": 851006.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.3142000138759613, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2835240662097931, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.3718723058700562, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.4036319889128208, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2975, "step": 357 }, { "loss": 0.0209, "grad_norm": 0.07863418012857437, "learning_rate": 5.972222222222223e-06, "num_tokens": 853122.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5867999792098999, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3187500238418579, "rewards/belief_accuracy/std": 0.0, "reward": 1.8864500522613525, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.5231885313987732, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.29833333333333334, "step": 358 }, { "loss": 0.0147, "grad_norm": 2.9344420433044434, "learning_rate": 5.833333333333334e-06, "num_tokens": 855562.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4784500002861023, "rewards/env_reward/std": 0.049500007182359695, "rewards/belief_accuracy/mean": 0.2708725929260254, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.580292820930481, "reward_std": 0.12299706041812897, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3685489371418953, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2991666666666667, "step": 359 }, { "loss": 0.012, "grad_norm": 0.04218378663063049, "learning_rate": 5.694444444444445e-06, "num_tokens": 858002.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.10130000114440918, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3579941391944885, "rewards/belief_accuracy/std": 0.0, "reward": 1.2759324312210083, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.30064401030540466, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3, "step": 360 }, { "loss": 0.0245, "grad_norm": 0.05267792195081711, "learning_rate": 5.555555555555556e-06, "num_tokens": 860442.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.26089999079704285, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.35569527745246887, "rewards/belief_accuracy/std": 0.0, "reward": 0.7257359027862549, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.6117653846740723, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30083333333333334, "step": 361 }, { "loss": 0.1643, "grad_norm": 12.316222190856934, "learning_rate": 5.416666666666667e-06, "num_tokens": 862554.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4629000127315521, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3517557978630066, "rewards/belief_accuracy/std": 0.00890129804611206, "reward": 1.7996174097061157, "reward_std": 0.026703864336013794, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 4.106600508093834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3016666666666667, "step": 362 }, { "loss": 0.015, "grad_norm": 1.9306823015213013, "learning_rate": 5.277777777777778e-06, "num_tokens": 864994.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.41449999809265137, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4363061785697937, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 1.980668544769287, "reward_std": 0.06415006518363953, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3761134594678879, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3025, "step": 363 }, { "loss": 0.021, "grad_norm": 2.6175270080566406, "learning_rate": 5.138888888888889e-06, "num_tokens": 867434.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.12759999930858612, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.27312996983528137, "rewards/belief_accuracy/std": 0.03546025604009628, "reward": 1.060789942741394, "reward_std": 0.10638077557086945, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.523816742002964, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30333333333333334, "step": 364 }, { "loss": 0.027, "grad_norm": 1.343258023262024, "learning_rate": 5e-06, "num_tokens": 869874.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.05169999971985817, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3684219717979431, "rewards/belief_accuracy/std": 0.01599299907684326, "reward": 1.2328159809112549, "reward_std": 0.047978997230529785, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6741083934903145, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30416666666666664, "step": 365 }, { "loss": 0.0208, "grad_norm": 2.257349729537964, "learning_rate": 4.861111111111111e-06, "num_tokens": 872314.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.06369999796152115, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.39694103598594666, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.145273208618164, "reward_std": 0.05555550381541252, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5208408161997795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.305, "step": 366 }, { "loss": 0.0106, "grad_norm": 0.8866859674453735, "learning_rate": 4.722222222222222e-06, "num_tokens": 874757.0, "completions/mean_length": 10.75, "completions/min_length": 10.0, "completions/max_length": 13.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 13.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.08009999990463257, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3579941391944885, "rewards/belief_accuracy/std": 0.0, "reward": 1.2441325187683105, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 13.0, "kl": 0.26471298933029175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30583333333333335, "step": 367 }, { "loss": 0.1229, "grad_norm": 8.010123252868652, "learning_rate": 4.583333333333333e-06, "num_tokens": 877198.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/env_reward/mean": -0.8808000087738037, "rewards/env_reward/std": 1.4128000736236572, "rewards/belief_accuracy/mean": 0.1363677680492401, "rewards/belief_accuracy/std": 0.22492384910583496, "reward": -0.9120967388153076, "reward_std": 2.8924098014831543, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 3.0728435292840004, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30666666666666664, "step": 368 }, { "loss": 0.0196, "grad_norm": 4.981584072113037, "learning_rate": 4.444444444444445e-06, "num_tokens": 879643.0, "completions/mean_length": 11.25, "completions/min_length": 10.0, "completions/max_length": 15.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 15.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.43689998984336853, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.290595680475235, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.5771369934082031, "reward_std": 0.05555550381541252, "frac_reward_zero_std": 0.0, "completion_length": 15.0, "kl": 0.488948717713356, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3075, "step": 369 }, { "loss": 0.015, "grad_norm": 2.590527057647705, "learning_rate": 4.305555555555556e-06, "num_tokens": 882083.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5357999801635742, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3345862925052643, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.8574588298797607, "reward_std": 0.05555546283721924, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.37513425201177597, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30833333333333335, "step": 370 }, { "loss": 0.0156, "grad_norm": 2.3306689262390137, "learning_rate": 4.166666666666667e-06, "num_tokens": 884523.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4943999946117401, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4608090817928314, "rewards/belief_accuracy/std": 0.0015231966972351074, "reward": 2.17402720451355, "reward_std": 0.004569530487060547, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3898431584239006, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30916666666666665, "step": 371 }, { "loss": 0.0201, "grad_norm": 18.126691818237305, "learning_rate": 4.027777777777779e-06, "num_tokens": 886988.0, "completions/mean_length": 16.25, "completions/min_length": 10.0, "completions/max_length": 29.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 29.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.9989500045776367, "rewards/env_reward/std": 0.26506149768829346, "rewards/belief_accuracy/mean": 0.32745224237442017, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 2.5307817459106445, "reward_std": 0.4320550858974457, "frac_reward_zero_std": 0.0, "completion_length": 29.0, "kl": 0.5024303831160069, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31, "step": 372 }, { "loss": 0.0138, "grad_norm": 2.256619930267334, "learning_rate": 3.888888888888889e-06, "num_tokens": 889428.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.021199999377131462, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.38676851987838745, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.242105484008789, "reward_std": 0.05555550381541252, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3448418974876404, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31083333333333335, "step": 373 }, { "loss": 0.011, "grad_norm": 3.003289222717285, "learning_rate": 3.75e-06, "num_tokens": 891868.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.062199998646974564, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.20506228506565094, "rewards/belief_accuracy/std": 0.02138334885239601, "reward": 0.7584868669509888, "reward_std": 0.06415002793073654, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.2741120904684067, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31166666666666665, "step": 374 }, { "loss": 0.0371, "grad_norm": 10.024889945983887, "learning_rate": 3.611111111111111e-06, "num_tokens": 894330.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.16060000658035278, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3431697189807892, "rewards/belief_accuracy/std": 0.00252552330493927, "reward": 1.3204090595245361, "reward_std": 0.007576584815979004, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.926845133304596, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3125, "step": 375 }, { "loss": 0.0172, "grad_norm": 0.01764746382832527, "learning_rate": 3.4722222222222224e-06, "num_tokens": 896770.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2800000011920929, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3960277736186981, "rewards/belief_accuracy/std": 0.0, "reward": 1.6580833196640015, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.43021830916404724, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31333333333333335, "step": 376 }, { "loss": 0.0212, "grad_norm": 5.265389919281006, "learning_rate": 3.3333333333333333e-06, "num_tokens": 899238.0, "completions/mean_length": 17.0, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 12.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 16.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.20217500627040863, "rewards/env_reward/std": 0.28334999084472656, "rewards/belief_accuracy/mean": 0.4355999231338501, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.660062313079834, "reward_std": 0.4098671078681946, "frac_reward_zero_std": 0.0, "completion_length": 32.0, "kl": 0.529280386865139, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31416666666666665, "step": 377 }, { "loss": 0.0254, "grad_norm": 1.52178955078125, "learning_rate": 3.1944444444444443e-06, "num_tokens": 901693.0, "completions/mean_length": 13.75, "completions/min_length": 10.0, "completions/max_length": 25.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 13.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 25.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.032999999821186066, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4471418261528015, "rewards/belief_accuracy/std": 0.015365764498710632, "reward": 1.4409255981445312, "reward_std": 0.046097397804260254, "frac_reward_zero_std": 0.0, "completion_length": 25.0, "kl": 0.6352234184741974, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.315, "step": 378 }, { "loss": 0.0235, "grad_norm": 0.8627490997314453, "learning_rate": 3.0555555555555556e-06, "num_tokens": 904149.0, "completions/mean_length": 14.0, "completions/min_length": 10.0, "completions/max_length": 26.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 26.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4296000003814697, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2753424048423767, "rewards/belief_accuracy/std": 0.0, "reward": 1.5204272270202637, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 26.0, "kl": 0.5881659425795078, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31583333333333335, "step": 379 }, { "loss": 0.0142, "grad_norm": 2.6827125549316406, "learning_rate": 2.916666666666667e-06, "num_tokens": 906589.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.04839999973773956, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4228193163871765, "rewards/belief_accuracy/std": 0.007042735815048218, "reward": 1.3910578489303589, "reward_std": 0.021128177642822266, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.35598936676979065, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31666666666666665, "step": 380 }, { "loss": 0.0231, "grad_norm": 2.160460948944092, "learning_rate": 2.777777777777778e-06, "num_tokens": 909029.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.07639999687671661, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.36097338795661926, "rewards/belief_accuracy/std": 0.010556221008300781, "reward": 1.2475202083587646, "reward_std": 0.031668663024902344, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.578317403793335, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3175, "step": 381 }, { "loss": 0.0188, "grad_norm": 4.413422107696533, "learning_rate": 2.638888888888889e-06, "num_tokens": 911474.0, "completions/mean_length": 11.25, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6983000040054321, "rewards/env_reward/std": 0.1550000011920929, "rewards/belief_accuracy/mean": 0.40389877557754517, "rewards/belief_accuracy/std": 0.0277777761220932, "reward": 2.3091464042663574, "reward_std": 0.31583333015441895, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.47100868076086044, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31833333333333336, "step": 382 }, { "loss": 0.0175, "grad_norm": 5.446463584899902, "learning_rate": 2.5e-06, "num_tokens": 913932.0, "completions/mean_length": 14.5, "completions/min_length": 10.0, "completions/max_length": 28.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 14.5, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 28.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.7389000058174133, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2647581994533539, "rewards/belief_accuracy/std": 0.018518514931201935, "reward": 1.9526245594024658, "reward_std": 0.05555550381541252, "frac_reward_zero_std": 0.0, "completion_length": 28.0, "kl": 0.4363628067076206, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31916666666666665, "step": 383 }, { "loss": 0.0212, "grad_norm": 2.315631151199341, "learning_rate": 2.361111111111111e-06, "num_tokens": 916372.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.16750000417232513, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.39694103598594666, "rewards/belief_accuracy/std": 0.03546025976538658, "reward": 0.9895731210708618, "reward_std": 0.10638080537319183, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5301463380455971, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32, "step": 384 }, { "loss": 0.0181, "grad_norm": 2.120096445083618, "learning_rate": 2.2222222222222225e-06, "num_tokens": 918834.0, "completions/mean_length": 15.5, "completions/min_length": 10.0, "completions/max_length": 32.0, "completions/clipped_ratio": 0.25, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5238999724388123, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3253270387649536, "rewards/belief_accuracy/std": 0.0, "reward": 1.811830997467041, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 32.0, "kl": 0.45274005830287933, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32083333333333336, "step": 385 }, { "loss": 0.0128, "grad_norm": 0.03539842367172241, "learning_rate": 2.0833333333333334e-06, "num_tokens": 921274.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.4812999963760376, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.40483367443084717, "rewards/belief_accuracy/std": 0.0, "reward": 1.9864510297775269, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.320296972990036, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32166666666666666, "step": 386 }, { "loss": 0.0132, "grad_norm": 2.4435579776763916, "learning_rate": 1.9444444444444444e-06, "num_tokens": 923714.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.27720001339912415, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3661750555038452, "rewards/belief_accuracy/std": 0.018518507480621338, "reward": 1.564325213432312, "reward_std": 0.055555541068315506, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.33051633834838867, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3225, "step": 387 }, { "loss": 0.0245, "grad_norm": 3.437168836593628, "learning_rate": 1.8055555555555555e-06, "num_tokens": 926154.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.07639999687671661, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.36154788732528687, "rewards/belief_accuracy/std": 0.03024062141776085, "reward": 1.0200436115264893, "reward_std": 0.09072183817625046, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.6133978962898254, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3233333333333333, "step": 388 }, { "loss": 0.018, "grad_norm": 10.590235710144043, "learning_rate": 1.6666666666666667e-06, "num_tokens": 928601.0, "completions/mean_length": 11.75, "completions/min_length": 10.0, "completions/max_length": 17.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.75, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 17.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.0364999994635582, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4342842996120453, "rewards/belief_accuracy/std": 0.01588726043701172, "reward": 1.4076029062271118, "reward_std": 0.047661781311035156, "frac_reward_zero_std": 0.0, "completion_length": 17.0, "kl": 0.4510222151875496, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32416666666666666, "step": 389 }, { "loss": 0.0124, "grad_norm": 2.6361474990844727, "learning_rate": 1.5277777777777778e-06, "num_tokens": 931041.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.6389999985694885, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.4608090817928314, "rewards/belief_accuracy/std": 0.0015231966972351074, "reward": 2.3909270763397217, "reward_std": 0.004569530487060547, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.3107151463627815, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.325, "step": 390 }, { "loss": 0.0254, "grad_norm": 3.5152220726013184, "learning_rate": 1.388888888888889e-06, "num_tokens": 933482.0, "completions/mean_length": 10.25, "completions/min_length": 10.0, "completions/max_length": 11.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.25, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 11.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.156700000166893, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.44561946392059326, "rewards/belief_accuracy/std": 0.0012579113245010376, "reward": 1.62190842628479, "reward_std": 0.003773768898099661, "frac_reward_zero_std": 0.0, "completion_length": 11.0, "kl": 0.6350157707929611, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3258333333333333, "step": 391 }, { "loss": 0.0556, "grad_norm": 1.8789347410202026, "learning_rate": 1.25e-06, "num_tokens": 935414.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.9753749966621399, "rewards/env_reward/std": 0.5192500352859497, "rewards/belief_accuracy/mean": 0.4274568557739258, "rewards/belief_accuracy/std": 0.032788295298814774, "reward": 2.7954330444335938, "reward_std": 0.8637241721153259, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.3899996429681778, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32666666666666666, "step": 392 }, { "loss": 0.011, "grad_norm": 0.022810563445091248, "learning_rate": 1.1111111111111112e-06, "num_tokens": 937854.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.21940000355243683, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2821272611618042, "rewards/belief_accuracy/std": 0.0, "reward": 1.2254817485809326, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.275255411863327, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3275, "step": 393 }, { "loss": 0.0096, "grad_norm": 0.06492399424314499, "learning_rate": 9.722222222222222e-07, "num_tokens": 940294.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.21359999477863312, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.28939110040664673, "rewards/belief_accuracy/std": 0.0, "reward": 1.2385733127593994, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 10.0, "kl": 0.2405586689710617, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3283333333333333, "step": 394 }, { "loss": 0.0546, "grad_norm": 0.5544055104255676, "learning_rate": 8.333333333333333e-07, "num_tokens": 942734.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.2231999933719635, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3189590871334076, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.3416773080825806, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 1.3655546456575394, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32916666666666666, "step": 395 }, { "loss": 0.0132, "grad_norm": 1.817050576210022, "learning_rate": 6.944444444444445e-07, "num_tokens": 945038.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": -0.27239999175071716, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.2454039305448532, "rewards/belief_accuracy/std": 0.04660391807556152, "reward": 0.37761184573173523, "reward_std": 0.13981175422668457, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.33038008213043213, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.33, "step": 396 }, { "loss": 0.0312, "grad_norm": 4.133925437927246, "learning_rate": 5.555555555555556e-07, "num_tokens": 946969.0, "completions/mean_length": 9.75, "completions/min_length": 9.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 9.75, "completions/min_terminated_length": 9.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.0878499746322632, "rewards/env_reward/std": 0.48350000381469727, "rewards/belief_accuracy/mean": 0.31071361899375916, "rewards/belief_accuracy/std": 0.0555555634200573, "reward": 2.6139159202575684, "reward_std": 0.7964601516723633, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.7797681391239166, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3308333333333333, "step": 397 }, { "loss": 0.0181, "grad_norm": 4.173895835876465, "learning_rate": 4.1666666666666667e-07, "num_tokens": 949413.0, "completions/mean_length": 11.0, "completions/min_length": 10.0, "completions/max_length": 14.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 11.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 14.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.13279999792575836, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3091141879558563, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.1765426397323608, "reward_std": 0.055555541068315506, "frac_reward_zero_std": 0.0, "completion_length": 14.0, "kl": 0.4529978558421135, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.33166666666666667, "step": 398 }, { "loss": 0.0185, "grad_norm": 0.8653059005737305, "learning_rate": 2.777777777777778e-07, "num_tokens": 951345.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 0.5584999918937683, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.3020426034927368, "rewards/belief_accuracy/std": 0.018518522381782532, "reward": 1.7938777208328247, "reward_std": 0.05555558204650879, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.46213603019714355, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3325, "step": 399 }, { "loss": 0.0215, "grad_norm": 1.37034010887146, "learning_rate": 1.388888888888889e-07, "num_tokens": 953277.0, "completions/mean_length": 10.0, "completions/min_length": 10.0, "completions/max_length": 10.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 10.0, "completions/min_terminated_length": 10.0, "completions/max_terminated_length": 10.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/env_reward/mean": 1.246399998664856, "rewards/env_reward/std": 0.0, "rewards/belief_accuracy/mean": 0.440767377614975, "rewards/belief_accuracy/std": 0.020541606470942497, "reward": 3.2419023513793945, "reward_std": 0.06162475422024727, "frac_reward_zero_std": 0.0, "completion_length": 10.0, "kl": 0.5379725992679596, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3333333333333333, "step": 400 }, { "train_runtime": 1554.6603, "train_samples_per_second": 1.029, "train_steps_per_second": 0.257, "total_flos": 0.0, "train_loss": 0.024693291002186014, "epoch": 0.3333333333333333, "step": 400 } ]