{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0005, "frac_reward_zero_std": 1.0, "grad_norm": 0.32243695855140686, "kl": 0.016345822252333164, "learning_rate": 0.0, "loss": 0.0007, "num_tokens": 2516.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 23.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.001, "frac_reward_zero_std": 1.0, "grad_norm": 0.1738930642604828, "kl": 0.0056577762588858604, "learning_rate": 1.0000000000000002e-06, "loss": 0.0002, "num_tokens": 5035.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0015, "frac_reward_zero_std": 0.0, "grad_norm": 53.453521728515625, "kl": 0.027107596397399902, "learning_rate": 2.0000000000000003e-06, "loss": 0.0011, "num_tokens": 7545.0, "reward": -3.724677085876465, "reward_std": 2.4506454467773438, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.212284803390503, "rewards/env_reward/std": 1.5754303932189941, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.002, "frac_reward_zero_std": 1.0, "grad_norm": 1.2984156608581543, "kl": 0.013630361296236515, "learning_rate": 3e-06, "loss": 0.0005, "num_tokens": 10063.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0025, "frac_reward_zero_std": 0.0, "grad_norm": 61.420711517333984, "kl": 0.0825746851041913, "learning_rate": 4.000000000000001e-06, "loss": 0.0033, "num_tokens": 12536.0, "reward": -3.895512342453003, "reward_std": 2.1089749336242676, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.3261749744415283, "rewards/env_reward/std": 1.3476500511169434, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.33333396911621, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.003, "frac_reward_zero_std": 0.0, "grad_norm": 11.032736778259277, "kl": 0.0035573970526456833, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 15059.0, "reward": -2.084261417388916, "reward_std": 3.3090696334838867, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.147840976715088, "rewards/env_reward/std": 2.1386890411376953, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0035, "frac_reward_zero_std": 1.0, "grad_norm": 0.20612499117851257, "kl": 0.007132542319595814, "learning_rate": 6e-06, "loss": 0.0003, "num_tokens": 17568.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 10.10438346862793, "kl": 0.05212839285377413, "learning_rate": 7.000000000000001e-06, "loss": 0.0021, "num_tokens": 20049.0, "reward": -3.4786999225616455, "reward_std": 2.9425997734069824, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.04830002784729, "rewards/env_reward/std": 1.90339994430542, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0045, "frac_reward_zero_std": 1.0, "grad_norm": 0.36320337653160095, "kl": 0.005910599138587713, "learning_rate": 8.000000000000001e-06, "loss": 0.0002, "num_tokens": 22569.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.005, "frac_reward_zero_std": 1.0, "grad_norm": 35.37952423095703, "kl": 0.214208863559179, "learning_rate": 9e-06, "loss": 0.0086, "num_tokens": 25052.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0055, "frac_reward_zero_std": 0.0, "grad_norm": 219.09710693359375, "kl": 0.09427966503426433, "learning_rate": 1e-05, "loss": 0.0038, "num_tokens": 27536.0, "reward": -2.487870216369629, "reward_std": 2.853968858718872, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.4169135093688965, "rewards/env_reward/std": 1.8355563879013062, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.006, "frac_reward_zero_std": 1.0, "grad_norm": 5.1641130447387695, "kl": 0.02741223480552435, "learning_rate": 1.1000000000000001e-05, "loss": 0.0011, "num_tokens": 30040.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0065, "frac_reward_zero_std": 0.0, "grad_norm": 18.722599029541016, "kl": 0.16925985834677704, "learning_rate": 1.2e-05, "loss": 0.0068, "num_tokens": 32510.0, "reward": -2.37943172454834, "reward_std": 2.9682364463806152, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3446213006973267, "rewards/env_reward/std": 1.9114667177200317, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.007, "frac_reward_zero_std": 1.0, "grad_norm": 0.1022278442978859, "kl": 0.006297597661614418, "learning_rate": 1.3000000000000001e-05, "loss": 0.0003, "num_tokens": 35028.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.0075, "frac_reward_zero_std": 1.0, "grad_norm": 0.032320525497198105, "kl": 0.002568609546869993, "learning_rate": 1.4000000000000001e-05, "loss": 0.0001, "num_tokens": 37556.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 22.33333396911621, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.008, "frac_reward_zero_std": 1.0, "grad_norm": 8.214491844177246, "kl": 0.041143732611089945, "learning_rate": 1.5e-05, "loss": 0.0016, "num_tokens": 40055.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0085, "frac_reward_zero_std": 0.0, "grad_norm": 39.8804817199707, "kl": 0.04003936113440432, "learning_rate": 1.6000000000000003e-05, "loss": 0.0016, "num_tokens": 42531.0, "reward": -2.5680184364318848, "reward_std": 2.750802993774414, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.11806440353393555, "rewards/belief_accuracy/std": 0.03612881526350975, "rewards/env_reward/mean": -1.4383834600448608, "rewards/env_reward/std": 1.8049958944320679, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.009, "frac_reward_zero_std": 1.0, "grad_norm": 1.495725393295288, "kl": 0.019086187705397606, "learning_rate": 1.7000000000000003e-05, "loss": 0.0008, "num_tokens": 45044.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0095, "frac_reward_zero_std": 1.0, "grad_norm": 26.531925201416016, "kl": 0.09912175685167313, "learning_rate": 1.8e-05, "loss": 0.004, "num_tokens": 47535.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 8.011573791503906, "kl": 0.0038544870913028717, "learning_rate": 1.9e-05, "loss": 0.0002, "num_tokens": 50063.0, "reward": -2.320432662963867, "reward_std": 3.037968873977661, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.305288553237915, "rewards/env_reward/std": 1.9579919576644897, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0105, "frac_reward_zero_std": 1.0, "grad_norm": 36.5081787109375, "kl": 0.2546631218865514, "learning_rate": 2e-05, "loss": 0.0102, "num_tokens": 52561.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.011, "frac_reward_zero_std": 1.0, "grad_norm": 1.0799552202224731, "kl": 0.009861491620540619, "learning_rate": 2.1e-05, "loss": 0.0004, "num_tokens": 55083.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 20.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0115, "frac_reward_zero_std": 1.0, "grad_norm": 23.75230598449707, "kl": 0.20189414219930768, "learning_rate": 2.2000000000000003e-05, "loss": 0.0081, "num_tokens": 57588.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.012, "frac_reward_zero_std": 0.0, "grad_norm": 102.0042953491211, "kl": 0.1681511290371418, "learning_rate": 2.3000000000000003e-05, "loss": 0.0067, "num_tokens": 60080.0, "reward": -1.766066074371338, "reward_std": 2.126420736312866, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.9648774862289429, "rewards/env_reward/std": 1.3593891859054565, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 13.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0125, "frac_reward_zero_std": 1.0, "grad_norm": 2.4423506259918213, "kl": 0.0637103128246963, "learning_rate": 2.4e-05, "loss": 0.0025, "num_tokens": 62571.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712759494781494, "kl": 0.005990173202008009, "learning_rate": 2.5e-05, "loss": 0.0002, "num_tokens": 65098.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0135, "frac_reward_zero_std": 1.0, "grad_norm": 0.1384836584329605, "kl": 0.018408390693366528, "learning_rate": 2.6000000000000002e-05, "loss": 0.0007, "num_tokens": 67611.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.014, "frac_reward_zero_std": 1.0, "grad_norm": 0.17841196060180664, "kl": 0.008233492728322744, "learning_rate": 2.7000000000000002e-05, "loss": 0.0003, "num_tokens": 70139.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0145, "frac_reward_zero_std": 1.0, "grad_norm": 14.524484634399414, "kl": 0.07956769224256277, "learning_rate": 2.8000000000000003e-05, "loss": 0.0032, "num_tokens": 72647.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.015, "frac_reward_zero_std": 1.0, "grad_norm": 0.6826711297035217, "kl": 0.05026988545432687, "learning_rate": 2.9e-05, "loss": 0.002, "num_tokens": 75144.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0155, "frac_reward_zero_std": 0.0, "grad_norm": 11050.3515625, "kl": 36.80695866746828, "learning_rate": 3e-05, "loss": 1.4723, "num_tokens": 77637.0, "reward": -2.443718194961548, "reward_std": 2.9238996505737305, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.0741666704416275, "rewards/belief_accuracy/std": 0.05166666582226753, "rewards/env_reward/mean": -1.443312168121338, "rewards/env_reward/std": 1.8071939945220947, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.016, "frac_reward_zero_std": 1.0, "grad_norm": 5.319654941558838, "kl": 0.1096202852204442, "learning_rate": 3.1e-05, "loss": 0.0044, "num_tokens": 80130.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0165, "frac_reward_zero_std": 0.0, "grad_norm": 28.577579498291016, "kl": 0.04399943072348833, "learning_rate": 3.2000000000000005e-05, "loss": 0.0018, "num_tokens": 82627.0, "reward": -3.7981131076812744, "reward_std": 2.3037734031677246, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.261242151260376, "rewards/env_reward/std": 1.477515697479248, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.017, "frac_reward_zero_std": 1.0, "grad_norm": 3.7505602836608887, "kl": 0.04482424072921276, "learning_rate": 3.3e-05, "loss": 0.0018, "num_tokens": 85109.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0175, "frac_reward_zero_std": 0.0, "grad_norm": 20.112499237060547, "kl": 0.0021229138001217507, "learning_rate": 3.4000000000000007e-05, "loss": 0.0001, "num_tokens": 87574.0, "reward": 0.1572304666042328, "reward_std": 0.04570581018924713, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.014735294505953789, "rewards/belief_accuracy/std": 0.09565715491771698, "rewards/env_reward/mean": 0.10095755755901337, "rewards/env_reward/std": 0.20054571330547333, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.018, "frac_reward_zero_std": 1.0, "grad_norm": 0.24909250438213348, "kl": 0.024185666348785162, "learning_rate": 3.5e-05, "loss": 0.001, "num_tokens": 90072.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 27.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0185, "frac_reward_zero_std": 0.0, "grad_norm": 11.509799003601074, "kl": 0.01711271144449711, "learning_rate": 3.6e-05, "loss": 0.0007, "num_tokens": 92595.0, "reward": -3.6846251487731934, "reward_std": 2.530749559402466, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.1855833530426025, "rewards/env_reward/std": 1.6288331747055054, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.019, "frac_reward_zero_std": 1.0, "grad_norm": 0.46738389134407043, "kl": 0.012128827278502285, "learning_rate": 3.7e-05, "loss": 0.0005, "num_tokens": 95085.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0195, "frac_reward_zero_std": 1.0, "grad_norm": 2.194053888320923, "kl": 0.039654724299907684, "learning_rate": 3.8e-05, "loss": 0.0016, "num_tokens": 97595.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.02, "frac_reward_zero_std": 1.0, "grad_norm": 0.2551957964897156, "kl": 0.02670608414337039, "learning_rate": 3.9000000000000006e-05, "loss": 0.0011, "num_tokens": 100093.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0205, "frac_reward_zero_std": 1.0, "grad_norm": 0.4345109760761261, "kl": 0.010095613077282906, "learning_rate": 4e-05, "loss": 0.0004, "num_tokens": 102612.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.021, "frac_reward_zero_std": 1.0, "grad_norm": 0.6849669218063354, "kl": 0.08905280428007245, "learning_rate": 4.1e-05, "loss": 0.0036, "num_tokens": 105099.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0215, "frac_reward_zero_std": 0.0, "grad_norm": 113.153564453125, "kl": 0.12807448720559478, "learning_rate": 4.2e-05, "loss": 0.0051, "num_tokens": 107595.0, "reward": -2.9136834144592285, "reward_std": 2.423197031021118, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.700788974761963, "rewards/env_reward/std": 1.5501903295516968, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.022, "frac_reward_zero_std": 0.0, "grad_norm": 3.1417529582977295, "kl": 0.05178070580586791, "learning_rate": 4.3e-05, "loss": 0.0021, "num_tokens": 110123.0, "reward": -3.766486167907715, "reward_std": 2.3670270442962646, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.2401576042175293, "rewards/env_reward/std": 1.5196847915649414, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0225, "frac_reward_zero_std": 0.0, "grad_norm": 31.787551879882812, "kl": 0.1364445798099041, "learning_rate": 4.4000000000000006e-05, "loss": 0.0055, "num_tokens": 112599.0, "reward": -2.7055277824401855, "reward_std": 2.6139395236968994, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.12398147583007812, "rewards/belief_accuracy/std": 0.04796295985579491, "rewards/env_reward/mean": -1.5182223320007324, "rewards/env_reward/std": 1.736833095550537, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 20.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.023, "frac_reward_zero_std": 0.0, "grad_norm": 809.3003540039062, "kl": 1.457309697754681, "learning_rate": 4.5e-05, "loss": 0.0583, "num_tokens": 115104.0, "reward": -2.267341136932373, "reward_std": 3.0976674556732178, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.2698941230773926, "rewards/env_reward/std": 1.9977540969848633, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0235, "frac_reward_zero_std": 1.0, "grad_norm": 17.53619384765625, "kl": 1.0537898712791502, "learning_rate": 4.600000000000001e-05, "loss": 0.0422, "num_tokens": 117580.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 187.8842010498047, "kl": 0.3121867855079472, "learning_rate": 4.7e-05, "loss": 0.0125, "num_tokens": 120073.0, "reward": -3.6274335384368896, "reward_std": 2.645132541656494, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.1474556922912598, "rewards/env_reward/std": 1.705088496208191, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0245, "frac_reward_zero_std": 0.0, "grad_norm": 18.377052307128906, "kl": 0.10642453748732805, "learning_rate": 4.8e-05, "loss": 0.0043, "num_tokens": 122588.0, "reward": -3.7050957679748535, "reward_std": 2.4898080825805664, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.199230432510376, "rewards/env_reward/std": 1.6015390157699585, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 27.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.025, "frac_reward_zero_std": 0.0, "grad_norm": 8.019148826599121, "kl": 0.056871576234698296, "learning_rate": 4.9e-05, "loss": 0.0023, "num_tokens": 125111.0, "reward": -3.6691508293151855, "reward_std": 2.5616979598999023, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.175267219543457, "rewards/env_reward/std": 1.6494653224945068, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 26.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0255, "frac_reward_zero_std": 0.0, "grad_norm": 13.499008178710938, "kl": 0.0715335039421916, "learning_rate": 5e-05, "loss": 0.0029, "num_tokens": 127628.0, "reward": -2.420839786529541, "reward_std": 2.920422315597534, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.372226595878601, "rewards/env_reward/std": 1.8795907497406006, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.026, "frac_reward_zero_std": 0.0, "grad_norm": 5.560655117034912, "kl": 0.017654206603765488, "learning_rate": 4.9888888888888894e-05, "loss": 0.0007, "num_tokens": 130156.0, "reward": -4.051011085510254, "reward_std": 1.7979769706726074, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.4298410415649414, "rewards/env_reward/std": 1.1403180360794067, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0265, "frac_reward_zero_std": 0.0, "grad_norm": 18.195871353149414, "kl": 0.24096931191161275, "learning_rate": 4.977777777777778e-05, "loss": 0.0096, "num_tokens": 132651.0, "reward": -2.6885905265808105, "reward_std": 2.6208035945892334, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.5507268905639648, "rewards/env_reward/std": 1.6801002025604248, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 17.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.027, "frac_reward_zero_std": 1.0, "grad_norm": 123.76203155517578, "kl": 5.633732934948057, "learning_rate": 4.966666666666667e-05, "loss": 0.2253, "num_tokens": 135150.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 27.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0275, "frac_reward_zero_std": 0.0, "grad_norm": 11.095720291137695, "kl": 0.04261765070259571, "learning_rate": 4.955555555555556e-05, "loss": 0.0017, "num_tokens": 137669.0, "reward": -3.5939033031463623, "reward_std": 2.712193012237549, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.1251022815704346, "rewards/env_reward/std": 1.7497954368591309, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.028, "frac_reward_zero_std": 0.0, "grad_norm": 7.206400394439697, "kl": 0.0018149956013076007, "learning_rate": 4.9444444444444446e-05, "loss": 0.0001, "num_tokens": 140095.0, "reward": -0.6511554718017578, "reward_std": 0.4664153754711151, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.2507702708244324, "rewards/env_reward/std": 0.3109435439109802, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0285, "frac_reward_zero_std": 0.0, "grad_norm": 9.72509479522705, "kl": 0.03556834487244487, "learning_rate": 4.933333333333334e-05, "loss": 0.0014, "num_tokens": 142580.0, "reward": -1.1361982822418213, "reward_std": 2.5431196689605713, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5449655055999756, "rewards/env_reward/std": 1.6370937824249268, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.029, "frac_reward_zero_std": 0.0, "grad_norm": 69.71238708496094, "kl": 0.12905889004468918, "learning_rate": 4.922222222222222e-05, "loss": 0.0052, "num_tokens": 145053.0, "reward": -3.697530746459961, "reward_std": 2.5049378871917725, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.1941874027252197, "rewards/env_reward/std": 1.61162531375885, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0295, "frac_reward_zero_std": 0.0, "grad_norm": 4.306075572967529, "kl": 0.05293075350346044, "learning_rate": 4.9111111111111114e-05, "loss": 0.0021, "num_tokens": 147515.0, "reward": -0.8994538187980652, "reward_std": 0.14630256593227386, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.165370374917984, "rewards/belief_accuracy/std": 0.04776628687977791, "rewards/env_reward/mean": -0.30222848057746887, "rewards/env_reward/std": 0.13543139398097992, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 5.006049633026123, "kl": 0.024995889314595843, "learning_rate": 4.9e-05, "loss": 0.001, "num_tokens": 149950.0, "reward": -0.2717297375202179, "reward_std": 0.27656516432762146, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.03703703731298447, "rewards/belief_accuracy/std": 0.04781460762023926, "rewards/env_reward/mean": -0.14041242003440857, "rewards/env_reward/std": 0.21106119453907013, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0305, "frac_reward_zero_std": 1.0, "grad_norm": 1.1231383085250854, "kl": 0.25201990082859993, "learning_rate": 4.888888888888889e-05, "loss": 0.0101, "num_tokens": 152433.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.031, "frac_reward_zero_std": 0.0, "grad_norm": 14.847359657287598, "kl": 0.3126356555148959, "learning_rate": 4.8777777777777775e-05, "loss": 0.0125, "num_tokens": 154901.0, "reward": -1.7808257341384888, "reward_std": 3.659447193145752, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.9455505609512329, "rewards/env_reward/std": 2.372274160385132, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0315, "frac_reward_zero_std": 0.0, "grad_norm": 11.686541557312012, "kl": 0.153579062782228, "learning_rate": 4.866666666666667e-05, "loss": 0.0061, "num_tokens": 157399.0, "reward": -2.4691736698150635, "reward_std": 2.869215488433838, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.404449224472046, "rewards/env_reward/std": 1.845564842224121, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 20.33333396911621, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 92.96546936035156, "kl": 0.2878706678748131, "learning_rate": 4.855555555555556e-05, "loss": 0.0115, "num_tokens": 159892.0, "reward": -3.7632439136505127, "reward_std": 2.373511791229248, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.2379961013793945, "rewards/env_reward/std": 1.5240079164505005, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0325, "frac_reward_zero_std": 0.0, "grad_norm": 36.191368103027344, "kl": 0.35182441864162683, "learning_rate": 4.844444444444445e-05, "loss": 0.0141, "num_tokens": 162402.0, "reward": -2.2611498832702637, "reward_std": 3.114436149597168, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10736748576164246, "rewards/belief_accuracy/std": 0.014734972268342972, "rewards/env_reward/mean": -1.255198359489441, "rewards/env_reward/std": 2.0199925899505615, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 11.666666984558105, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.033, "frac_reward_zero_std": 0.0, "grad_norm": 228884.828125, "kl": 468.09647609852254, "learning_rate": 4.8333333333333334e-05, "loss": 18.7239, "num_tokens": 164869.0, "reward": -3.877704381942749, "reward_std": 2.1445908546447754, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.314302921295166, "rewards/env_reward/std": 1.3713939189910889, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0335, "frac_reward_zero_std": 0.0, "grad_norm": 4.94432258605957, "kl": 0.21442949026823044, "learning_rate": 4.8222222222222225e-05, "loss": 0.0086, "num_tokens": 167387.0, "reward": -3.75144624710083, "reward_std": 2.397106885910034, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.230130910873413, "rewards/env_reward/std": 1.5397380590438843, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 8.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.034, "frac_reward_zero_std": 0.0, "grad_norm": 10.818193435668945, "kl": 0.9112066635861993, "learning_rate": 4.811111111111111e-05, "loss": 0.0364, "num_tokens": 169620.0, "reward": 0.4229079484939575, "reward_std": 0.2314292937517166, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.24860529601573944, "rewards/env_reward/std": 0.154286190867424, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 13.333333969116211, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0345, "frac_reward_zero_std": 0.0, "grad_norm": 25.480619430541992, "kl": 2.5357193499803543, "learning_rate": 4.8e-05, "loss": 0.1014, "num_tokens": 172092.0, "reward": -2.201890468597412, "reward_std": 3.173243284225464, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.2262604236602783, "rewards/env_reward/std": 2.04813814163208, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 22.666667938232422, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.035, "frac_reward_zero_std": 0.0, "grad_norm": 8.468518257141113, "kl": 0.5803861692547798, "learning_rate": 4.7888888888888886e-05, "loss": 0.0232, "num_tokens": 174592.0, "reward": -2.594465732574463, "reward_std": 2.7201738357543945, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.0878773033618927, "rewards/belief_accuracy/std": 0.024245386943221092, "rewards/env_reward/mean": -1.5163891315460205, "rewards/env_reward/std": 1.7145698070526123, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 22.666667938232422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0355, "frac_reward_zero_std": 0.0, "grad_norm": 46.88726806640625, "kl": 0.7163544222712517, "learning_rate": 4.7777777777777784e-05, "loss": 0.0287, "num_tokens": 177092.0, "reward": -2.0240089893341064, "reward_std": 3.3790602684020996, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.1076725721359253, "rewards/env_reward/std": 2.1853580474853516, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.036, "frac_reward_zero_std": 0.0, "grad_norm": 4.72477388381958, "kl": 0.7021452663466334, "learning_rate": 4.766666666666667e-05, "loss": 0.0281, "num_tokens": 179581.0, "reward": -1.4259536266326904, "reward_std": 2.3681116104125977, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7381357550621033, "rewards/env_reward/std": 1.5208872556686401, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0365, "frac_reward_zero_std": 0.0, "grad_norm": 26.196685791015625, "kl": 0.6366847828030586, "learning_rate": 4.755555555555556e-05, "loss": 0.0255, "num_tokens": 182076.0, "reward": -1.664202332496643, "reward_std": 2.311755418777466, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08500000089406967, "rewards/belief_accuracy/std": 0.030000001192092896, "rewards/env_reward/mean": -0.9311348795890808, "rewards/env_reward/std": 1.4874457120895386, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.037, "frac_reward_zero_std": 0.0, "grad_norm": 38.14860916137695, "kl": 1.0752212293446064, "learning_rate": 4.7444444444444445e-05, "loss": 0.043, "num_tokens": 184544.0, "reward": -1.49713134765625, "reward_std": 2.301912307739258, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7855876684188843, "rewards/env_reward/std": 1.476274847984314, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.0375, "frac_reward_zero_std": 0.0, "grad_norm": 3.1271560192108154, "kl": 0.22124752588570118, "learning_rate": 4.7333333333333336e-05, "loss": 0.0088, "num_tokens": 187072.0, "reward": -2.7508177757263184, "reward_std": 2.5399067401885986, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.5922119617462158, "rewards/env_reward/std": 1.6259276866912842, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.038, "frac_reward_zero_std": 0.0, "grad_norm": 5.755521774291992, "kl": 0.16726691462099552, "learning_rate": 4.722222222222222e-05, "loss": 0.0067, "num_tokens": 189600.0, "reward": -3.8879446983337402, "reward_std": 2.124109983444214, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.3211300373077393, "rewards/env_reward/std": 1.357740044593811, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0385, "frac_reward_zero_std": 1.0, "grad_norm": 0.24942930042743683, "kl": 0.789710771292448, "learning_rate": 4.711111111111111e-05, "loss": 0.0316, "num_tokens": 192108.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 11.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.039, "frac_reward_zero_std": 0.0, "grad_norm": 4.9244184494018555, "kl": 1.1887651532888412, "learning_rate": 4.7e-05, "loss": 0.0476, "num_tokens": 194575.0, "reward": -1.6062259674072266, "reward_std": 2.2595274448394775, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08573612570762634, "rewards/belief_accuracy/std": 0.028527740389108658, "rewards/env_reward/mean": -0.8910117149353027, "rewards/env_reward/std": 1.4291024208068848, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 21.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0395, "frac_reward_zero_std": 0.0, "grad_norm": 4.589632034301758, "kl": 0.914489395916462, "learning_rate": 4.6888888888888895e-05, "loss": 0.0366, "num_tokens": 197081.0, "reward": -1.3840163946151733, "reward_std": 2.4131362438201904, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.681010901927948, "rewards/env_reward/std": 1.562245488166809, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 19.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 23.94391441345215, "kl": 0.7511968985199928, "learning_rate": 4.677777777777778e-05, "loss": 0.03, "num_tokens": 199584.0, "reward": -1.7008922100067139, "reward_std": 2.489635705947876, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.9214280843734741, "rewards/env_reward/std": 1.6092621088027954, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0405, "frac_reward_zero_std": 0.0, "grad_norm": 14.980483055114746, "kl": 1.2917132005095482, "learning_rate": 4.666666666666667e-05, "loss": 0.0517, "num_tokens": 202050.0, "reward": -1.1099207401275635, "reward_std": 2.573901891708374, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5274472236633301, "rewards/env_reward/std": 1.6579262018203735, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.041, "frac_reward_zero_std": 0.0, "grad_norm": 18.656517028808594, "kl": 1.1007941216230392, "learning_rate": 4.6555555555555556e-05, "loss": 0.044, "num_tokens": 204526.0, "reward": -0.9679015278816223, "reward_std": 2.6547322273254395, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.43276768922805786, "rewards/env_reward/std": 1.7114882469177246, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0415, "frac_reward_zero_std": 0.0, "grad_norm": 6.2242937088012695, "kl": 1.004544973373413, "learning_rate": 4.644444444444445e-05, "loss": 0.0402, "num_tokens": 207005.0, "reward": -2.1635308265686035, "reward_std": 3.2175371646881104, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.200687289237976, "rewards/env_reward/std": 2.077667474746704, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.042, "frac_reward_zero_std": 0.0, "grad_norm": 14.178193092346191, "kl": 0.9078696174547076, "learning_rate": 4.633333333333333e-05, "loss": 0.0363, "num_tokens": 209475.0, "reward": 0.19402220845222473, "reward_std": 0.2724432051181793, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.00833333283662796, "rewards/belief_accuracy/std": 0.10671874135732651, "rewards/env_reward/mean": 0.08351479470729828, "rewards/env_reward/std": 0.14911670982837677, "rewards/format_valid/mean": 0.875, "rewards/format_valid/std": 0.25, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0425, "frac_reward_zero_std": 0.0, "grad_norm": 7.778254508972168, "kl": 1.0725902691483498, "learning_rate": 4.6222222222222224e-05, "loss": 0.0429, "num_tokens": 211956.0, "reward": -0.8444531559944153, "reward_std": 2.7435097694396973, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0925000011920929, "rewards/belief_accuracy/std": 0.015000000596046448, "rewards/env_reward/mean": -0.3696354031562805, "rewards/env_reward/std": 1.7566334009170532, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.043, "frac_reward_zero_std": 0.0, "grad_norm": 4.498073101043701, "kl": 0.7945144101977348, "learning_rate": 4.6111111111111115e-05, "loss": 0.0318, "num_tokens": 214458.0, "reward": -0.4258846640586853, "reward_std": 0.525246798992157, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07500000298023224, "rewards/belief_accuracy/std": 0.05000000074505806, "rewards/env_reward/mean": -0.15475642681121826, "rewards/env_reward/std": 0.276262104511261, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 15.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0435, "frac_reward_zero_std": 0.0, "grad_norm": 5.651465892791748, "kl": 1.144854974001646, "learning_rate": 4.600000000000001e-05, "loss": 0.0458, "num_tokens": 216952.0, "reward": -2.7960398197174072, "reward_std": 2.4879508018493652, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10818149149417877, "rewards/belief_accuracy/std": 0.016362976282835007, "rewards/env_reward/mean": -1.610163688659668, "rewards/env_reward/std": 1.6060127019882202, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.044, "frac_reward_zero_std": 0.0, "grad_norm": 4.523915767669678, "kl": 1.0343455001711845, "learning_rate": 4.588888888888889e-05, "loss": 0.0414, "num_tokens": 219437.0, "reward": -2.3472089767456055, "reward_std": 3.005443811416626, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3231394290924072, "rewards/env_reward/std": 1.9362717866897583, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0445, "frac_reward_zero_std": 0.0, "grad_norm": 3.352187395095825, "kl": 1.272004920989275, "learning_rate": 4.577777777777778e-05, "loss": 0.0509, "num_tokens": 221904.0, "reward": -1.3286750316619873, "reward_std": 2.4145350456237793, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.6441167593002319, "rewards/env_reward/std": 1.5708537101745605, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.045, "frac_reward_zero_std": 0.0, "grad_norm": 3.992652654647827, "kl": 1.1837435215711594, "learning_rate": 4.566666666666667e-05, "loss": 0.0473, "num_tokens": 224393.0, "reward": -0.807397723197937, "reward_std": 2.8079702854156494, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.3299318850040436, "rewards/env_reward/std": 1.8133907318115234, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0455, "frac_reward_zero_std": 0.0, "grad_norm": 5.360654354095459, "kl": 1.3510248363018036, "learning_rate": 4.555555555555556e-05, "loss": 0.054, "num_tokens": 226870.0, "reward": 0.35237032175064087, "reward_std": 1.0852247476577759, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.418246865272522, "rewards/env_reward/std": 0.723483145236969, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 19.666667938232422, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.046, "frac_reward_zero_std": 0.0, "grad_norm": 3.634124517440796, "kl": 0.8390218988060951, "learning_rate": 4.5444444444444444e-05, "loss": 0.0336, "num_tokens": 229361.0, "reward": -2.2453417778015137, "reward_std": 3.167144298553467, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.25522780418396, "rewards/env_reward/std": 2.0450401306152344, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0465, "frac_reward_zero_std": 1.0, "grad_norm": 0.345480352640152, "kl": 1.7472785264253616, "learning_rate": 4.5333333333333335e-05, "loss": 0.0699, "num_tokens": 231818.0, "reward": 0.20606237649917603, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3207082748413086, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.047, "frac_reward_zero_std": 0.0, "grad_norm": 4.007046222686768, "kl": 1.7106561437249184, "learning_rate": 4.522222222222223e-05, "loss": 0.0684, "num_tokens": 234305.0, "reward": -1.3136588335037231, "reward_std": 2.4297609329223633, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6632725596427917, "rewards/env_reward/std": 1.5616451501846313, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0475, "frac_reward_zero_std": 0.0, "grad_norm": 6.215799808502197, "kl": 2.4182121604681015, "learning_rate": 4.511111111111112e-05, "loss": 0.0967, "num_tokens": 236746.0, "reward": -1.4073553085327148, "reward_std": 2.4502437114715576, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7257369160652161, "rewards/env_reward/std": 1.5773454904556274, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 2.8741865158081055, "kl": 1.375985711812973, "learning_rate": 4.5e-05, "loss": 0.055, "num_tokens": 239220.0, "reward": -1.8366073369979858, "reward_std": 2.075605630874634, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.0119048357009888, "rewards/env_reward/std": 1.325404167175293, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0485, "frac_reward_zero_std": 0.0, "grad_norm": 2.7931768894195557, "kl": 1.1252032294869423, "learning_rate": 4.4888888888888894e-05, "loss": 0.045, "num_tokens": 241690.0, "reward": -0.21447324752807617, "reward_std": 0.08092716336250305, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.040351178497076035, "rewards/env_reward/std": 0.053951445966959, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.049, "frac_reward_zero_std": 0.0, "grad_norm": 3.6995668411254883, "kl": 0.2477953266352415, "learning_rate": 4.477777777777778e-05, "loss": 0.0099, "num_tokens": 243898.0, "reward": 0.10004599392414093, "reward_std": 0.12990380823612213, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.03336399421095848, "rewards/env_reward/std": 0.08660253882408142, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0495, "frac_reward_zero_std": 0.0, "grad_norm": 3.9689114093780518, "kl": 1.3716598898172379, "learning_rate": 4.466666666666667e-05, "loss": 0.0549, "num_tokens": 246394.0, "reward": 0.05885888263583183, "reward_std": 0.17086723446846008, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11083333194255829, "rewards/belief_accuracy/std": 0.021666666492819786, "rewards/env_reward/mean": 0.2400725781917572, "rewards/env_reward/std": 0.08001596480607986, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 6.792644500732422, "kl": 1.8348833322525024, "learning_rate": 4.4555555555555555e-05, "loss": 0.0734, "num_tokens": 248839.0, "reward": 0.2527257204055786, "reward_std": 0.15090236067771912, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.35181713104248047, "rewards/env_reward/std": 0.10060158371925354, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0505, "frac_reward_zero_std": 0.0, "grad_norm": 2.8620173931121826, "kl": 1.4036446511745453, "learning_rate": 4.4444444444444447e-05, "loss": 0.0561, "num_tokens": 251307.0, "reward": -1.31059730052948, "reward_std": 2.4275147914886475, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.1158333271741867, "rewards/belief_accuracy/std": 0.03166666254401207, "rewards/env_reward/mean": -0.6337315440177917, "rewards/env_reward/std": 1.577512264251709, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.051, "frac_reward_zero_std": 0.0, "grad_norm": 2.1169137954711914, "kl": 1.8475644141435623, "learning_rate": 4.433333333333334e-05, "loss": 0.0739, "num_tokens": 253767.0, "reward": -1.3670077323913574, "reward_std": 2.399441719055176, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6988385319709778, "rewards/env_reward/std": 1.5415664911270142, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0515, "frac_reward_zero_std": 0.0, "grad_norm": 3.684938430786133, "kl": 2.059985037893057, "learning_rate": 4.422222222222222e-05, "loss": 0.0824, "num_tokens": 256198.0, "reward": -1.4205896854400635, "reward_std": 2.3623111248016357, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7345598340034485, "rewards/env_reward/std": 1.516780972480774, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.052, "frac_reward_zero_std": 0.0, "grad_norm": 3.5143520832061768, "kl": 1.3098777011036873, "learning_rate": 4.4111111111111114e-05, "loss": 0.0524, "num_tokens": 258632.0, "reward": -0.0835796445608139, "reward_std": 0.2586938738822937, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0363982617855072, "rewards/belief_accuracy/std": 0.11159241199493408, "rewards/env_reward/mean": -0.016256578266620636, "rewards/env_reward/std": 0.26623615622520447, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0525, "frac_reward_zero_std": 0.0, "grad_norm": 6.2300004959106445, "kl": 1.785375103354454, "learning_rate": 4.4000000000000006e-05, "loss": 0.0714, "num_tokens": 261090.0, "reward": -0.08198876678943634, "reward_std": 0.8859658241271973, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.12867416441440582, "rewards/env_reward/std": 0.5906438827514648, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.053, "frac_reward_zero_std": 0.0, "grad_norm": 2.8233327865600586, "kl": 1.1763433814048767, "learning_rate": 4.388888888888889e-05, "loss": 0.0471, "num_tokens": 263555.0, "reward": -2.6301207542419434, "reward_std": 2.697817087173462, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.511747121810913, "rewards/env_reward/std": 1.7316814661026, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0535, "frac_reward_zero_std": 0.0, "grad_norm": 5.215397357940674, "kl": 1.5616333931684494, "learning_rate": 4.377777777777778e-05, "loss": 0.0625, "num_tokens": 266021.0, "reward": -0.27485907077789307, "reward_std": 0.8942175507545471, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 9.395182132720947e-05, "rewards/env_reward/std": 0.5961450934410095, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.054, "frac_reward_zero_std": 0.0, "grad_norm": 4.469447612762451, "kl": 2.5851728469133377, "learning_rate": 4.3666666666666666e-05, "loss": 0.1034, "num_tokens": 268461.0, "reward": -1.1452138423919678, "reward_std": 2.582923650741577, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5509759187698364, "rewards/env_reward/std": 1.6647002696990967, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0545, "frac_reward_zero_std": 0.0, "grad_norm": 7.712915420532227, "kl": 2.6475657522678375, "learning_rate": 4.355555555555556e-05, "loss": 0.1059, "num_tokens": 270898.0, "reward": 0.12694786489009857, "reward_std": 0.002898484468460083, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.26796525716781616, "rewards/env_reward/std": 0.0019323229789733887, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.055, "frac_reward_zero_std": 0.0, "grad_norm": 7.97365140914917, "kl": 2.896424412727356, "learning_rate": 4.344444444444445e-05, "loss": 0.1159, "num_tokens": 273372.0, "reward": -0.7330765128135681, "reward_std": 0.20495304465293884, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.06111111119389534, "rewards/belief_accuracy/std": 0.07777778059244156, "rewards/env_reward/mean": -0.38732877373695374, "rewards/env_reward/std": 0.12224072217941284, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0555, "frac_reward_zero_std": 0.0, "grad_norm": 4.209381580352783, "kl": 1.7662896811962128, "learning_rate": 4.3333333333333334e-05, "loss": 0.0707, "num_tokens": 275814.0, "reward": 0.027455374598503113, "reward_std": 0.4914630353450775, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2016369253396988, "rewards/env_reward/std": 0.3276420533657074, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 3.021775960922241, "kl": 2.54108564555645, "learning_rate": 4.3222222222222226e-05, "loss": 0.1016, "num_tokens": 278282.0, "reward": -0.2888333201408386, "reward_std": 0.15247361361980438, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.009222209453582764, "rewards/env_reward/std": 0.10164907574653625, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0565, "frac_reward_zero_std": 0.0, "grad_norm": 4.275335311889648, "kl": 1.6538867950439453, "learning_rate": 4.311111111111111e-05, "loss": 0.0662, "num_tokens": 280752.0, "reward": 0.437593936920166, "reward_std": 0.39731013774871826, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.47506263852119446, "rewards/env_reward/std": 0.2648734450340271, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.057, "frac_reward_zero_std": 0.0, "grad_norm": 6.433754920959473, "kl": 1.647656962275505, "learning_rate": 4.3e-05, "loss": 0.0659, "num_tokens": 283204.0, "reward": -1.3276481628417969, "reward_std": 2.414963722229004, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6725987792015076, "rewards/env_reward/std": 1.551644206047058, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 10.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0575, "frac_reward_zero_std": 0.0, "grad_norm": 2.6330208778381348, "kl": 1.515267439186573, "learning_rate": 4.2888888888888886e-05, "loss": 0.0606, "num_tokens": 285667.0, "reward": -3.7085390090942383, "reward_std": 2.482921838760376, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.201525926589966, "rewards/env_reward/std": 1.5969480276107788, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.058, "frac_reward_zero_std": 0.0, "grad_norm": 2.3021349906921387, "kl": 1.7087249606847763, "learning_rate": 4.277777777777778e-05, "loss": 0.0683, "num_tokens": 288114.0, "reward": 0.5330584049224854, "reward_std": 0.35837167501449585, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5387057065963745, "rewards/env_reward/std": 0.23891450464725494, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0585, "frac_reward_zero_std": 0.0, "grad_norm": 5.497950553894043, "kl": 2.1271141320466995, "learning_rate": 4.266666666666667e-05, "loss": 0.0851, "num_tokens": 290540.0, "reward": 0.4702581763267517, "reward_std": 0.44036781787872314, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4968388080596924, "rewards/env_reward/std": 0.293578565120697, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.059, "frac_reward_zero_std": 0.0, "grad_norm": 4.789796829223633, "kl": 1.4464631527662277, "learning_rate": 4.255555555555556e-05, "loss": 0.0579, "num_tokens": 293023.0, "reward": -0.7475403547286987, "reward_std": 2.8080575466156006, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0703703761100769, "rewards/belief_accuracy/std": 0.059259265661239624, "rewards/env_reward/mean": -0.3492862284183502, "rewards/env_reward/std": 1.7720966339111328, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0595, "frac_reward_zero_std": 1.0, "grad_norm": 1.5602530241012573, "kl": 1.6833766214549541, "learning_rate": 4.2444444444444445e-05, "loss": 0.0673, "num_tokens": 295460.0, "reward": -0.42602595686912537, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.10068397223949432, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 4.7347092628479, "kl": 1.445710226893425, "learning_rate": 4.233333333333334e-05, "loss": 0.0578, "num_tokens": 297907.0, "reward": -1.0152814388275146, "reward_std": 2.623145580291748, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.4643542170524597, "rewards/env_reward/std": 1.6904305219650269, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0605, "frac_reward_zero_std": 0.0, "grad_norm": 6.493462085723877, "kl": 1.4399118982255459, "learning_rate": 4.222222222222222e-05, "loss": 0.0576, "num_tokens": 300352.0, "reward": -0.20259986817836761, "reward_std": 0.06754998862743378, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.048266757279634476, "rewards/env_reward/std": 0.04503332078456879, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.061, "frac_reward_zero_std": 0.0, "grad_norm": 2.4330313205718994, "kl": 1.4334093481302261, "learning_rate": 4.211111111111111e-05, "loss": 0.0573, "num_tokens": 302825.0, "reward": -0.5075480937957764, "reward_std": 0.6749432682991028, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.15503208339214325, "rewards/env_reward/std": 0.44996219873428345, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0615, "frac_reward_zero_std": 0.0, "grad_norm": 4.5208353996276855, "kl": 1.7931447178125381, "learning_rate": 4.2e-05, "loss": 0.0717, "num_tokens": 305291.0, "reward": -2.456667900085449, "reward_std": 2.895482063293457, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3961119651794434, "rewards/env_reward/std": 1.863360047340393, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.062, "frac_reward_zero_std": 0.0, "grad_norm": 2.7689969539642334, "kl": 0.8019402623176575, "learning_rate": 4.188888888888889e-05, "loss": 0.0321, "num_tokens": 307775.0, "reward": -1.4924118518829346, "reward_std": 2.360426187515259, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.782441258430481, "rewards/env_reward/std": 1.5167045593261719, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0625, "frac_reward_zero_std": 0.0, "grad_norm": 3.6446032524108887, "kl": 1.4752652198076248, "learning_rate": 4.177777777777778e-05, "loss": 0.059, "num_tokens": 310203.0, "reward": 0.8273366689682007, "reward_std": 0.6383920311927795, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7348911762237549, "rewards/env_reward/std": 0.4255947172641754, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.063, "frac_reward_zero_std": 0.0, "grad_norm": 5.112602233886719, "kl": 1.6532337069511414, "learning_rate": 4.166666666666667e-05, "loss": 0.0661, "num_tokens": 312653.0, "reward": -0.10410824418067932, "reward_std": 0.01273045688867569, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.11392784118652344, "rewards/env_reward/std": 0.008486974984407425, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0635, "frac_reward_zero_std": 0.0, "grad_norm": 4.351461887359619, "kl": 2.1480718851089478, "learning_rate": 4.155555555555556e-05, "loss": 0.0859, "num_tokens": 315110.0, "reward": -0.3739127516746521, "reward_std": 0.14415279030799866, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.06594181805849075, "rewards/env_reward/std": 0.09610186517238617, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 3.8212032318115234, "kl": 1.534121721982956, "learning_rate": 4.144444444444445e-05, "loss": 0.0614, "num_tokens": 317584.0, "reward": -0.19062533974647522, "reward_std": 0.3150945007801056, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0893678218126297, "rewards/belief_accuracy/std": 0.021264348179101944, "rewards/env_reward/mean": 0.03081876039505005, "rewards/env_reward/std": 0.24891482293605804, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0645, "frac_reward_zero_std": 0.0, "grad_norm": 4.938203811645508, "kl": 2.1303387582302094, "learning_rate": 4.133333333333333e-05, "loss": 0.0852, "num_tokens": 320029.0, "reward": -1.1215572357177734, "reward_std": 2.5629398822784424, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.06083333492279053, "rewards/belief_accuracy/std": 0.07833334058523178, "rewards/env_reward/mean": -0.6177048683166504, "rewards/env_reward/std": 1.588196873664856, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.065, "frac_reward_zero_std": 0.0, "grad_norm": 5.400176048278809, "kl": 1.8505046516656876, "learning_rate": 4.1222222222222224e-05, "loss": 0.074, "num_tokens": 322471.0, "reward": -0.12519629299640656, "reward_std": 0.14092496037483215, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.09986913949251175, "rewards/env_reward/std": 0.09394997358322144, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0655, "frac_reward_zero_std": 0.0, "grad_norm": 3.194300413131714, "kl": 1.2793779149651527, "learning_rate": 4.111111111111111e-05, "loss": 0.0512, "num_tokens": 324940.0, "reward": -1.0001001358032227, "reward_std": 2.700178623199463, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.45423343777656555, "rewards/env_reward/std": 1.743279218673706, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.066, "frac_reward_zero_std": 0.0, "grad_norm": 4.973412036895752, "kl": 1.9231543093919754, "learning_rate": 4.1e-05, "loss": 0.0769, "num_tokens": 327385.0, "reward": -1.7763125896453857, "reward_std": 2.129706382751465, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.9717084169387817, "rewards/env_reward/std": 1.3618686199188232, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 7.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0665, "frac_reward_zero_std": 0.0, "grad_norm": 2.070490598678589, "kl": 1.150221362709999, "learning_rate": 4.088888888888889e-05, "loss": 0.046, "num_tokens": 329838.0, "reward": 0.4534025192260742, "reward_std": 1.0941553115844727, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4856016933917999, "rewards/env_reward/std": 0.7294369339942932, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 10.333333969116211, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.067, "frac_reward_zero_std": 0.0, "grad_norm": 1.878156065940857, "kl": 1.4795889034867287, "learning_rate": 4.0777777777777783e-05, "loss": 0.0592, "num_tokens": 332301.0, "reward": -2.0403780937194824, "reward_std": 3.3793559074401855, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.1185853481292725, "rewards/env_reward/std": 2.1859495639801025, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0675, "frac_reward_zero_std": 1.0, "grad_norm": 0.1908632516860962, "kl": 1.9114599525928497, "learning_rate": 4.066666666666667e-05, "loss": 0.0765, "num_tokens": 334745.0, "reward": 0.5449367761611938, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5466245412826538, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 10.666666984558105, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.068, "frac_reward_zero_std": 0.0, "grad_norm": 3.5480082035064697, "kl": 1.5377977713942528, "learning_rate": 4.055555555555556e-05, "loss": 0.0615, "num_tokens": 337209.0, "reward": -1.2076761722564697, "reward_std": 2.4953103065490723, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5926175117492676, "rewards/env_reward/std": 1.6052173376083374, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0685, "frac_reward_zero_std": 0.0, "grad_norm": 1.0419105291366577, "kl": 1.1934361532330513, "learning_rate": 4.0444444444444444e-05, "loss": 0.0477, "num_tokens": 339665.0, "reward": -1.1166263818740845, "reward_std": 2.606243848800659, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5319175720214844, "rewards/env_reward/std": 1.6803356409072876, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.069, "frac_reward_zero_std": 0.0, "grad_norm": 3.009521007537842, "kl": 2.4323032796382904, "learning_rate": 4.0333333333333336e-05, "loss": 0.0973, "num_tokens": 342138.0, "reward": 0.6563852429389954, "reward_std": 0.8735789656639099, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.057500001043081284, "rewards/belief_accuracy/std": 0.08499999344348907, "rewards/env_reward/mean": 0.5317568182945251, "rewards/env_reward/std": 0.6170323491096497, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.0695, "frac_reward_zero_std": 0.0, "grad_norm": 3.7213194370269775, "kl": 3.0655910074710846, "learning_rate": 4.022222222222222e-05, "loss": 0.1226, "num_tokens": 344573.0, "reward": 0.5249032378196716, "reward_std": 0.1243140697479248, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5332688093185425, "rewards/env_reward/std": 0.0828760415315628, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.07, "frac_reward_zero_std": 1.0, "grad_norm": 0.32563015818595886, "kl": 2.0985984057188034, "learning_rate": 4.011111111111111e-05, "loss": 0.0839, "num_tokens": 347006.0, "reward": -0.07693907618522644, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1320406198501587, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0964287742972374, "kl": 1.2693939208984375, "learning_rate": 4e-05, "loss": 0.0508, "num_tokens": 349465.0, "reward": -0.02015012502670288, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.16989992558956146, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.071, "frac_reward_zero_std": 1.0, "grad_norm": 0.09562604129314423, "kl": 1.128716617822647, "learning_rate": 3.9888888888888895e-05, "loss": 0.0451, "num_tokens": 351950.0, "reward": 0.7584548592567444, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6889699697494507, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0715, "frac_reward_zero_std": 0.0, "grad_norm": 4.745733261108398, "kl": 0.7463721930980682, "learning_rate": 3.977777777777778e-05, "loss": 0.0299, "num_tokens": 354455.0, "reward": 0.3327590823173523, "reward_std": 0.23017629981040955, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.405172735452652, "rewards/env_reward/std": 0.1534508764743805, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 4.888156890869141, "kl": 1.4033671617507935, "learning_rate": 3.966666666666667e-05, "loss": 0.0561, "num_tokens": 356946.0, "reward": -0.13394379615783691, "reward_std": 0.41236963868141174, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.09403747320175171, "rewards/env_reward/std": 0.27491310238838196, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0725, "frac_reward_zero_std": 0.0, "grad_norm": 3.65702486038208, "kl": 1.6139360815286636, "learning_rate": 3.9555555555555556e-05, "loss": 0.0646, "num_tokens": 359381.0, "reward": -1.7238547801971436, "reward_std": 2.1609649658203125, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.12072296440601349, "rewards/belief_accuracy/std": 0.07126190513372421, "rewards/env_reward/mean": -0.9077907204627991, "rewards/env_reward/std": 1.3948062658309937, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.073, "frac_reward_zero_std": 0.0, "grad_norm": 2.8972437381744385, "kl": 1.160056695342064, "learning_rate": 3.944444444444445e-05, "loss": 0.0464, "num_tokens": 361855.0, "reward": 0.13612942397594452, "reward_std": 0.02916666865348816, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0972222238779068, "rewards/belief_accuracy/std": 0.0055555556900799274, "rewards/env_reward/mean": 0.2643640637397766, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0735, "frac_reward_zero_std": 0.0, "grad_norm": 4.382909774780273, "kl": 1.568796619772911, "learning_rate": 3.933333333333333e-05, "loss": 0.0628, "num_tokens": 364331.0, "reward": 0.800137996673584, "reward_std": 0.3069959282875061, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7167587280273438, "rewards/env_reward/std": 0.2046639323234558, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 19.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.074, "frac_reward_zero_std": 0.0, "grad_norm": 8.64746379852295, "kl": 0.8096725344657898, "learning_rate": 3.922222222222223e-05, "loss": 0.0324, "num_tokens": 366833.0, "reward": -0.15847638249397278, "reward_std": 1.3163249492645264, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.07768243551254272, "rewards/env_reward/std": 0.877549946308136, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0745, "frac_reward_zero_std": 0.0, "grad_norm": 5.473668098449707, "kl": 2.7600976526737213, "learning_rate": 3.9111111111111115e-05, "loss": 0.1104, "num_tokens": 369284.0, "reward": -2.9569857120513916, "reward_std": 2.3198444843292236, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.7296571731567383, "rewards/env_reward/std": 1.4797673225402832, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.075, "frac_reward_zero_std": 0.0, "grad_norm": 3.437213897705078, "kl": 1.6173148602247238, "learning_rate": 3.9000000000000006e-05, "loss": 0.0647, "num_tokens": 371770.0, "reward": 0.5740416049957275, "reward_std": 0.233365997672081, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.13980931043624878, "rewards/belief_accuracy/std": 0.07961863279342651, "rewards/env_reward/mean": 0.6414797306060791, "rewards/env_reward/std": 0.1132500022649765, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0755, "frac_reward_zero_std": 1.0, "grad_norm": 0.2341485172510147, "kl": 1.3624602407217026, "learning_rate": 3.888888888888889e-05, "loss": 0.0545, "num_tokens": 374253.0, "reward": 0.2103109359741211, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3235406279563904, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 23.33333396911621, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.076, "frac_reward_zero_std": 0.0, "grad_norm": 3.359372615814209, "kl": 1.1529072970151901, "learning_rate": 3.877777777777778e-05, "loss": 0.0461, "num_tokens": 376755.0, "reward": 0.4146992564201355, "reward_std": 0.46390998363494873, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.45979946851730347, "rewards/env_reward/std": 0.3092733323574066, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0765, "frac_reward_zero_std": 0.0, "grad_norm": 5.655915260314941, "kl": 1.3275744514539838, "learning_rate": 3.866666666666667e-05, "loss": 0.0531, "num_tokens": 379211.0, "reward": -0.0012441501021385193, "reward_std": 0.24833057820796967, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.23028355836868286, "rewards/belief_accuracy/std": 0.09518812596797943, "rewards/env_reward/mean": 0.42640429735183716, "rewards/env_reward/std": 0.10868140310049057, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.077, "frac_reward_zero_std": 0.0, "grad_norm": 3.993246555328369, "kl": 1.3614933341741562, "learning_rate": 3.855555555555556e-05, "loss": 0.0545, "num_tokens": 381699.0, "reward": -0.3912268280982971, "reward_std": 3.040301561355591, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.01915118098258972, "rewards/env_reward/std": 1.9872325658798218, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0937976986169815, "kl": 1.2733041644096375, "learning_rate": 3.844444444444444e-05, "loss": 0.0509, "num_tokens": 384182.0, "reward": -0.1396826207637787, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.09021158516407013, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.078, "frac_reward_zero_std": 0.0, "grad_norm": 4.676441669464111, "kl": 0.9261074624955654, "learning_rate": 3.8333333333333334e-05, "loss": 0.037, "num_tokens": 386677.0, "reward": -1.3367525339126587, "reward_std": 2.411214828491211, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6786683797836304, "rewards/env_reward/std": 1.5492030382156372, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0785, "frac_reward_zero_std": 0.0, "grad_norm": 2.1922104358673096, "kl": 0.0721854604780674, "learning_rate": 3.8222222222222226e-05, "loss": 0.0029, "num_tokens": 389109.0, "reward": 0.8821967244148254, "reward_std": 0.15713486075401306, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.31481480598449707, "rewards/belief_accuracy/std": 0.052378278225660324, "rewards/env_reward/mean": 1.1844274997711182, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.079, "frac_reward_zero_std": 0.0, "grad_norm": 4.775229454040527, "kl": 1.4617139548063278, "learning_rate": 3.811111111111112e-05, "loss": 0.0585, "num_tokens": 391575.0, "reward": -0.6259194612503052, "reward_std": 0.5253891348838806, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10740740597248077, "rewards/belief_accuracy/std": 0.014814812690019608, "rewards/env_reward/mean": -0.22329813241958618, "rewards/env_reward/std": 0.3683049976825714, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0795, "frac_reward_zero_std": 0.0, "grad_norm": 2.7022299766540527, "kl": 1.3450734540820122, "learning_rate": 3.8e-05, "loss": 0.0538, "num_tokens": 394062.0, "reward": -1.487056851387024, "reward_std": 2.3527774810791016, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0871676579117775, "rewards/belief_accuracy/std": 0.025664685294032097, "rewards/env_reward/mean": -0.8087027072906494, "rewards/env_reward/std": 1.5039740800857544, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 4.860130310058594, "kl": 1.0795547626912594, "learning_rate": 3.7888888888888894e-05, "loss": 0.0432, "num_tokens": 396557.0, "reward": -0.01648128777742386, "reward_std": 0.3920603096485138, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1723458170890808, "rewards/env_reward/std": 0.2613735496997833, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 18.666667938232422, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0805, "frac_reward_zero_std": 0.0, "grad_norm": 3.8319482803344727, "kl": 1.2551886662840843, "learning_rate": 3.777777777777778e-05, "loss": 0.0502, "num_tokens": 399045.0, "reward": 0.17859038710594177, "reward_std": 0.8184104561805725, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3023936152458191, "rewards/env_reward/std": 0.5456069707870483, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.081, "frac_reward_zero_std": 0.0, "grad_norm": 2.120696783065796, "kl": 0.8586160615086555, "learning_rate": 3.766666666666667e-05, "loss": 0.0343, "num_tokens": 401559.0, "reward": 0.4954003691673279, "reward_std": 0.3572309911251068, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10083333402872086, "rewards/belief_accuracy/std": 0.0016666651936247945, "rewards/env_reward/mean": 0.5111002922058105, "rewards/env_reward/std": 0.24086414277553558, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0815, "frac_reward_zero_std": 0.0, "grad_norm": 8.276159286499023, "kl": 2.0177499651908875, "learning_rate": 3.7555555555555554e-05, "loss": 0.0807, "num_tokens": 404022.0, "reward": -0.14451055228710175, "reward_std": 0.07916668057441711, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08888889104127884, "rewards/belief_accuracy/std": 0.02222222276031971, "rewards/env_reward/mean": 0.06060408055782318, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.082, "frac_reward_zero_std": 0.0, "grad_norm": 5.653500080108643, "kl": 1.8464947640895844, "learning_rate": 3.7444444444444446e-05, "loss": 0.0739, "num_tokens": 406485.0, "reward": 0.27019041776657104, "reward_std": 0.23719999194145203, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11416666209697723, "rewards/belief_accuracy/std": 0.028333332389593124, "rewards/env_reward/mean": 0.3876269459724426, "rewards/env_reward/std": 0.10980000346899033, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 19.33333396911621, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0825, "frac_reward_zero_std": 1.0, "grad_norm": 0.24756693840026855, "kl": 1.6211326867341995, "learning_rate": 3.733333333333334e-05, "loss": 0.0648, "num_tokens": 408975.0, "reward": 0.03895732760429382, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.20930489897727966, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.083, "frac_reward_zero_std": 0.0, "grad_norm": 3.162034034729004, "kl": 1.216068983078003, "learning_rate": 3.722222222222222e-05, "loss": 0.0486, "num_tokens": 411464.0, "reward": -0.5613082051277161, "reward_std": 0.08749997615814209, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.16170544922351837, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0835, "frac_reward_zero_std": 0.0, "grad_norm": 6.356938362121582, "kl": 2.8624762892723083, "learning_rate": 3.7111111111111113e-05, "loss": 0.1145, "num_tokens": 413907.0, "reward": 0.14997538924217224, "reward_std": 0.7540647387504578, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2833169400691986, "rewards/env_reward/std": 0.5027098655700684, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.084, "frac_reward_zero_std": 0.0, "grad_norm": 4.261693477630615, "kl": 1.933813601732254, "learning_rate": 3.7e-05, "loss": 0.0774, "num_tokens": 416402.0, "reward": -0.05508837103843689, "reward_std": 0.11999882757663727, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.09027226269245148, "rewards/belief_accuracy/std": 0.019455470144748688, "rewards/env_reward/mean": 0.12298562377691269, "rewards/env_reward/std": 0.08219999819993973, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0845, "frac_reward_zero_std": 0.0, "grad_norm": 5.13167142868042, "kl": 2.3502594381570816, "learning_rate": 3.688888888888889e-05, "loss": 0.094, "num_tokens": 418887.0, "reward": 0.03691243380308151, "reward_std": 0.08749999105930328, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.2371082901954651, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 15.333333969116211, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.085, "frac_reward_zero_std": 0.0, "grad_norm": 3.1629037857055664, "kl": 1.5842487215995789, "learning_rate": 3.677777777777778e-05, "loss": 0.0634, "num_tokens": 421365.0, "reward": 0.5254287719726562, "reward_std": 0.11898240447044373, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.533619225025177, "rewards/env_reward/std": 0.07932159304618835, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0855, "frac_reward_zero_std": 0.0, "grad_norm": 3.2538228034973145, "kl": 1.3759911209344864, "learning_rate": 3.6666666666666666e-05, "loss": 0.055, "num_tokens": 423851.0, "reward": 1.0300487279891968, "reward_std": 0.04658208787441254, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1066666692495346, "rewards/belief_accuracy/std": 0.013333332724869251, "rewards/env_reward/mean": 0.8791991472244263, "rewards/env_reward/std": 0.01968872733414173, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.086, "frac_reward_zero_std": 1.0, "grad_norm": 0.15501928329467773, "kl": 1.7516742050647736, "learning_rate": 3.655555555555556e-05, "loss": 0.0701, "num_tokens": 426322.0, "reward": 0.030182331800460815, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2034548968076706, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0865, "frac_reward_zero_std": 1.0, "grad_norm": 37.33258056640625, "kl": 8.386772617697716, "learning_rate": 3.644444444444445e-05, "loss": 0.3355, "num_tokens": 428763.0, "reward": -0.9025059342384338, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.41833725571632385, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.087, "frac_reward_zero_std": 1.0, "grad_norm": 0.22313672304153442, "kl": 1.8691215515136719, "learning_rate": 3.633333333333333e-05, "loss": 0.0748, "num_tokens": 431237.0, "reward": -0.16483666002750397, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.07344222813844681, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0875, "frac_reward_zero_std": 0.0, "grad_norm": 4.1793341636657715, "kl": 2.593918561935425, "learning_rate": 3.6222222222222225e-05, "loss": 0.1038, "num_tokens": 433710.0, "reward": -0.9837551116943359, "reward_std": 2.6553149223327637, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.44333672523498535, "rewards/env_reward/std": 1.712130069732666, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.088, "frac_reward_zero_std": 1.0, "grad_norm": 0.10414294898509979, "kl": 1.4019053727388382, "learning_rate": 3.611111111111111e-05, "loss": 0.0561, "num_tokens": 436184.0, "reward": -0.12919571995735168, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.0972028523683548, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0885, "frac_reward_zero_std": 0.0, "grad_norm": 3.139967441558838, "kl": 1.0769911333918571, "learning_rate": 3.6e-05, "loss": 0.0431, "num_tokens": 438657.0, "reward": 0.6967830657958984, "reward_std": 0.08670443296432495, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6478554010391235, "rewards/env_reward/std": 0.05780297517776489, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 10.666666984558105, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.089, "frac_reward_zero_std": 0.0, "grad_norm": 3.0620081424713135, "kl": 1.0732092261314392, "learning_rate": 3.5888888888888886e-05, "loss": 0.0429, "num_tokens": 441121.0, "reward": -1.2789283990859985, "reward_std": 2.598663568496704, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6401189565658569, "rewards/env_reward/std": 1.6776195764541626, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0895, "frac_reward_zero_std": 1.0, "grad_norm": 0.18922147154808044, "kl": 0.8863924369215965, "learning_rate": 3.577777777777778e-05, "loss": 0.0355, "num_tokens": 443635.0, "reward": 1.347588062286377, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.0817253589630127, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 3.3570539951324463, "kl": 1.9235362261533737, "learning_rate": 3.566666666666667e-05, "loss": 0.0769, "num_tokens": 446101.0, "reward": 0.05883501470088959, "reward_std": 0.5488622784614563, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2225566804409027, "rewards/env_reward/std": 0.3659081757068634, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 17.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0905, "frac_reward_zero_std": 1.0, "grad_norm": 0.08723417669534683, "kl": 1.3284604251384735, "learning_rate": 3.555555555555556e-05, "loss": 0.0531, "num_tokens": 448585.0, "reward": 0.6743147373199463, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6328765153884888, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 9.666666984558105, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.091, "frac_reward_zero_std": 0.0, "grad_norm": 3.3371059894561768, "kl": 1.4546705782413483, "learning_rate": 3.5444444444444445e-05, "loss": 0.0582, "num_tokens": 451046.0, "reward": -0.3032863438129425, "reward_std": 0.33148258924484253, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.018857555463910103, "rewards/env_reward/std": 0.22098839282989502, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0915, "frac_reward_zero_std": 0.0, "grad_norm": 2.911696195602417, "kl": 1.4696582406759262, "learning_rate": 3.5333333333333336e-05, "loss": 0.0588, "num_tokens": 453544.0, "reward": -2.4303359985351562, "reward_std": 2.9513607025146484, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3785573244094849, "rewards/env_reward/std": 1.9012062549591064, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 13.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.092, "frac_reward_zero_std": 0.0, "grad_norm": 3.6932320594787598, "kl": 1.533248096704483, "learning_rate": 3.522222222222222e-05, "loss": 0.0613, "num_tokens": 456016.0, "reward": -1.5931193828582764, "reward_std": 3.8826847076416016, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.8204129934310913, "rewards/env_reward/std": 2.521214485168457, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0925, "frac_reward_zero_std": 0.0, "grad_norm": 4.075229167938232, "kl": 1.968793198466301, "learning_rate": 3.511111111111111e-05, "loss": 0.0788, "num_tokens": 458465.0, "reward": -0.23590603470802307, "reward_std": 0.2219301015138626, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.026062656193971634, "rewards/env_reward/std": 0.14795339107513428, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.093, "frac_reward_zero_std": 1.0, "grad_norm": 2.784778594970703, "kl": 2.159162014722824, "learning_rate": 3.5e-05, "loss": 0.0864, "num_tokens": 460898.0, "reward": -0.3167000114917755, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.027799999341368675, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 26.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0935, "frac_reward_zero_std": 0.0, "grad_norm": 3.384214162826538, "kl": 1.2219679579138756, "learning_rate": 3.4888888888888895e-05, "loss": 0.0489, "num_tokens": 463415.0, "reward": -1.0766644477844238, "reward_std": 2.582223653793335, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5052763223648071, "rewards/env_reward/std": 1.6631492376327515, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.094, "frac_reward_zero_std": 0.0, "grad_norm": 3.3013856410980225, "kl": 1.2444797977805138, "learning_rate": 3.477777777777778e-05, "loss": 0.0498, "num_tokens": 465874.0, "reward": -2.264209508895874, "reward_std": 3.1193079948425293, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.2678064107894897, "rewards/env_reward/std": 2.0125834941864014, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0945, "frac_reward_zero_std": 0.0, "grad_norm": 2.398057460784912, "kl": 1.1671398282051086, "learning_rate": 3.466666666666667e-05, "loss": 0.0467, "num_tokens": 468342.0, "reward": -0.32055363059043884, "reward_std": 0.08688756823539734, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.03036908432841301, "rewards/env_reward/std": 0.05792504921555519, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 7.333333492279053, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.095, "frac_reward_zero_std": 0.0, "grad_norm": 2.1980197429656982, "kl": 1.4500057846307755, "learning_rate": 3.4555555555555556e-05, "loss": 0.058, "num_tokens": 470796.0, "reward": 0.38532906770706177, "reward_std": 0.15841148793697357, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4402194023132324, "rewards/env_reward/std": 0.10560767352581024, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0955, "frac_reward_zero_std": 0.0, "grad_norm": 2.9431092739105225, "kl": 1.4747809767723083, "learning_rate": 3.444444444444445e-05, "loss": 0.059, "num_tokens": 473289.0, "reward": -1.0166206359863281, "reward_std": 0.02063235081732273, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.49441370368003845, "rewards/env_reward/std": 0.0137548903003335, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 7.505349159240723, "kl": 1.3912545293569565, "learning_rate": 3.433333333333333e-05, "loss": 0.0557, "num_tokens": 475757.0, "reward": -0.6259548664093018, "reward_std": 2.8915553092956543, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08980958163738251, "rewards/belief_accuracy/std": 0.02038082852959633, "rewards/env_reward/mean": -0.22935077548027039, "rewards/env_reward/std": 1.8575823307037354, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0965, "frac_reward_zero_std": 0.0, "grad_norm": 2.4287314414978027, "kl": 1.0645422227680683, "learning_rate": 3.4222222222222224e-05, "loss": 0.0426, "num_tokens": 478239.0, "reward": -1.1361416578292847, "reward_std": 2.5584716796875, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.18546631932258606, "rewards/belief_accuracy/std": 0.059523556381464005, "rewards/env_reward/mean": -0.3864951729774475, "rewards/env_reward/std": 1.7521181106567383, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.097, "frac_reward_zero_std": 0.0, "grad_norm": 2.376044511795044, "kl": 0.6852857172489166, "learning_rate": 3.411111111111111e-05, "loss": 0.0274, "num_tokens": 480748.0, "reward": 0.5333235263824463, "reward_std": 0.08749997615814209, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.5680490136146545, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0975, "frac_reward_zero_std": 0.0, "grad_norm": 3.4659929275512695, "kl": 1.7528847455978394, "learning_rate": 3.4000000000000007e-05, "loss": 0.0701, "num_tokens": 483203.0, "reward": 1.018233299255371, "reward_std": 0.009551048278808594, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.862155556678772, "rewards/env_reward/std": 0.006367385853081942, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.098, "frac_reward_zero_std": 0.0, "grad_norm": 7.004715442657471, "kl": 0.813011210411787, "learning_rate": 3.388888888888889e-05, "loss": 0.0325, "num_tokens": 485711.0, "reward": -2.4635062217712402, "reward_std": 2.8720784187316895, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.4006710052490234, "rewards/env_reward/std": 1.8473838567733765, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0985, "frac_reward_zero_std": 0.0, "grad_norm": 3.282201051712036, "kl": 0.7132957000285387, "learning_rate": 3.377777777777778e-05, "loss": 0.0285, "num_tokens": 488216.0, "reward": -3.926431894302368, "reward_std": 2.047135829925537, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.346787929534912, "rewards/env_reward/std": 1.3064239025115967, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.099, "frac_reward_zero_std": 0.0, "grad_norm": 1.8574674129486084, "kl": 1.1133069694042206, "learning_rate": 3.366666666666667e-05, "loss": 0.0445, "num_tokens": 490705.0, "reward": 0.6392979621887207, "reward_std": 0.2728678584098816, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.044744670391082764, "rewards/belief_accuracy/std": 0.11051066219806671, "rewards/env_reward/mean": 0.4948546886444092, "rewards/env_reward/std": 0.41126659512519836, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0995, "frac_reward_zero_std": 0.0, "grad_norm": 2.2241978645324707, "kl": 1.400051310658455, "learning_rate": 3.355555555555556e-05, "loss": 0.056, "num_tokens": 493194.0, "reward": 0.16355225443840027, "reward_std": 0.2303662747144699, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.29236820340156555, "rewards/env_reward/std": 0.153577521443367, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 27.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 3.1241228580474854, "kl": 0.7815838046371937, "learning_rate": 3.3444444444444443e-05, "loss": 0.0313, "num_tokens": 495713.0, "reward": -1.4204142093658447, "reward_std": 2.6858582496643066, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7344428300857544, "rewards/env_reward/std": 1.739694356918335, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1005, "frac_reward_zero_std": 0.0, "grad_norm": 3.880967617034912, "kl": 1.6194000542163849, "learning_rate": 3.3333333333333335e-05, "loss": 0.0648, "num_tokens": 498186.0, "reward": -1.1433579921722412, "reward_std": 2.5394091606140137, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5497387051582336, "rewards/env_reward/std": 1.6346454620361328, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.101, "frac_reward_zero_std": 0.0, "grad_norm": 3.6157476902008057, "kl": 1.4809669330716133, "learning_rate": 3.322222222222222e-05, "loss": 0.0592, "num_tokens": 500648.0, "reward": -0.7693363428115845, "reward_std": 2.7953262329101562, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0716666653752327, "rewards/belief_accuracy/std": 0.05666666850447655, "rewards/env_reward/mean": -0.3612242341041565, "rewards/env_reward/std": 1.7597646713256836, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1015, "frac_reward_zero_std": 0.0, "grad_norm": 2.5044844150543213, "kl": 0.5702618137001991, "learning_rate": 3.311111111111112e-05, "loss": 0.0228, "num_tokens": 503079.0, "reward": -0.09709322452545166, "reward_std": 0.09302432835102081, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1680680364370346, "rewards/belief_accuracy/std": 0.03100811131298542, "rewards/env_reward/mean": 0.2380739152431488, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.102, "frac_reward_zero_std": 0.0, "grad_norm": 2.225451707839966, "kl": 0.6446680650115013, "learning_rate": 3.3e-05, "loss": 0.0258, "num_tokens": 505599.0, "reward": -1.0992940664291382, "reward_std": 2.570491075515747, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.49119603633880615, "rewards/env_reward/std": 1.673166036605835, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1025, "frac_reward_zero_std": 0.0, "grad_norm": 11.69345760345459, "kl": 0.9406535923480988, "learning_rate": 3.2888888888888894e-05, "loss": 0.0376, "num_tokens": 508093.0, "reward": -1.0868068933486938, "reward_std": 2.6226813793182373, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5120379328727722, "rewards/env_reward/std": 1.6912070512771606, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.103, "frac_reward_zero_std": 0.0, "grad_norm": 3.8141772747039795, "kl": 0.38117800280451775, "learning_rate": 3.277777777777778e-05, "loss": 0.0152, "num_tokens": 510301.0, "reward": 0.5716937780380249, "reward_std": 0.2175557017326355, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.34779584407806396, "rewards/env_reward/std": 0.14503712952136993, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1035, "frac_reward_zero_std": 0.0, "grad_norm": 2.4618332386016846, "kl": 1.3801769241690636, "learning_rate": 3.266666666666667e-05, "loss": 0.0552, "num_tokens": 512788.0, "reward": -1.348587989807129, "reward_std": 2.476418972015381, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6865587830543518, "rewards/env_reward/std": 1.5944546461105347, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 7.705834865570068, "kl": 1.0085995495319366, "learning_rate": 3.2555555555555555e-05, "loss": 0.0403, "num_tokens": 515256.0, "reward": -1.0385560989379883, "reward_std": 2.6225454807281494, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.4798707962036133, "rewards/env_reward/std": 1.6903735399246216, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 11.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1045, "frac_reward_zero_std": 0.0, "grad_norm": 5.924132823944092, "kl": 1.7609535232186317, "learning_rate": 3.2444444444444446e-05, "loss": 0.0704, "num_tokens": 517722.0, "reward": -1.3413997888565063, "reward_std": 2.4143919944763184, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6817665696144104, "rewards/env_reward/std": 1.5514785051345825, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.105, "frac_reward_zero_std": 0.0, "grad_norm": 7.369988918304443, "kl": 1.1772667318582535, "learning_rate": 3.233333333333333e-05, "loss": 0.0471, "num_tokens": 520224.0, "reward": 0.25493913888931274, "reward_std": 0.33257579803466797, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3532927930355072, "rewards/env_reward/std": 0.2217172235250473, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1055, "frac_reward_zero_std": 0.0, "grad_norm": 2.5913333892822266, "kl": 0.7529645264148712, "learning_rate": 3.222222222222223e-05, "loss": 0.0301, "num_tokens": 522738.0, "reward": -1.3859096765518188, "reward_std": 2.4013755321502686, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7114397883415222, "rewards/env_reward/std": 1.5432217121124268, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.106, "frac_reward_zero_std": 0.0, "grad_norm": 4.363038539886475, "kl": 1.2446223124861717, "learning_rate": 3.2111111111111114e-05, "loss": 0.0498, "num_tokens": 525236.0, "reward": -2.56288743019104, "reward_std": 2.7684178352355957, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.466925024986267, "rewards/env_reward/std": 1.7785577774047852, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1065, "frac_reward_zero_std": 0.0, "grad_norm": 4.272139549255371, "kl": 1.5829559713602066, "learning_rate": 3.2000000000000005e-05, "loss": 0.0633, "num_tokens": 527450.0, "reward": 1.3202344179153442, "reward_std": 0.7838823199272156, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.84682297706604, "rewards/env_reward/std": 0.5225882530212402, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.107, "frac_reward_zero_std": 0.0, "grad_norm": 5.658717632293701, "kl": 1.2298424392938614, "learning_rate": 3.188888888888889e-05, "loss": 0.0492, "num_tokens": 529929.0, "reward": -1.5011694431304932, "reward_std": 2.299220323562622, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.78827965259552, "rewards/env_reward/std": 1.474480390548706, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 20.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1075, "frac_reward_zero_std": 0.0, "grad_norm": 3.059485912322998, "kl": 0.9900188595056534, "learning_rate": 3.177777777777778e-05, "loss": 0.0396, "num_tokens": 532434.0, "reward": -2.2220005989074707, "reward_std": 3.1529808044433594, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.2396671772003174, "rewards/env_reward/std": 2.0346951484680176, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.108, "frac_reward_zero_std": 0.0, "grad_norm": 5.903228282928467, "kl": 0.9180602729320526, "learning_rate": 3.1666666666666666e-05, "loss": 0.0367, "num_tokens": 534946.0, "reward": 0.06937577575445175, "reward_std": 0.3579734265804291, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.05766364932060242, "rewards/belief_accuracy/std": 0.084672711789608, "rewards/env_reward/mean": 0.14074449241161346, "rewards/env_reward/std": 0.25905489921569824, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1085, "frac_reward_zero_std": 0.0, "grad_norm": 7.5056562423706055, "kl": 1.428985133767128, "learning_rate": 3.155555555555556e-05, "loss": 0.0572, "num_tokens": 537428.0, "reward": -0.04300477355718613, "reward_std": 0.1483583301305771, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.15466348826885223, "rewards/env_reward/std": 0.09890555590391159, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.109, "frac_reward_zero_std": 0.0, "grad_norm": 2.7241830825805664, "kl": 0.9578761979937553, "learning_rate": 3.144444444444445e-05, "loss": 0.0383, "num_tokens": 539948.0, "reward": -1.9331963062286377, "reward_std": 2.0602900981903076, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0716666653752327, "rewards/belief_accuracy/std": 0.05666666850447655, "rewards/env_reward/mean": -1.137130856513977, "rewards/env_reward/std": 1.2618883848190308, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1095, "frac_reward_zero_std": 0.0, "grad_norm": 4.32493782043457, "kl": 0.58867571875453, "learning_rate": 3.1333333333333334e-05, "loss": 0.0235, "num_tokens": 542468.0, "reward": -1.2151740789413452, "reward_std": 2.491729736328125, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5976160764694214, "rewards/env_reward/std": 1.6028647422790527, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.11, "frac_reward_zero_std": 0.0, "grad_norm": 1.791263461112976, "kl": 0.3568975552916527, "learning_rate": 3.1222222222222225e-05, "loss": 0.0143, "num_tokens": 544900.0, "reward": 0.03496697545051575, "reward_std": 0.06415002793073654, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.17913591861724854, "rewards/belief_accuracy/std": 0.021383339539170265, "rewards/env_reward/mean": 0.3482498526573181, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1105, "frac_reward_zero_std": 0.0, "grad_norm": 2.383763313293457, "kl": 0.8193067982792854, "learning_rate": 3.111111111111111e-05, "loss": 0.0328, "num_tokens": 547414.0, "reward": 1.203812599182129, "reward_std": 0.6176812648773193, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9858750700950623, "rewards/env_reward/std": 0.41178756952285767, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.111, "frac_reward_zero_std": 0.0, "grad_norm": 2.43298602104187, "kl": 1.0093542635440826, "learning_rate": 3.1e-05, "loss": 0.0404, "num_tokens": 549923.0, "reward": -0.0007572025060653687, "reward_std": 0.016494423151016235, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10740740597248077, "rewards/belief_accuracy/std": 0.014814812690019608, "rewards/env_reward/mean": 0.19347669184207916, "rewards/env_reward/std": 0.010300002992153168, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1115, "frac_reward_zero_std": 0.0, "grad_norm": 6.343346118927002, "kl": 0.7701031491160393, "learning_rate": 3.088888888888889e-05, "loss": 0.0308, "num_tokens": 552439.0, "reward": -1.5342886447906494, "reward_std": 2.2832674980163574, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08815178275108337, "rewards/belief_accuracy/std": 0.023696430027484894, "rewards/env_reward/mean": -0.8382222652435303, "rewards/env_reward/std": 1.4423881769180298, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 3.0897319316864014, "kl": 0.8843832314014435, "learning_rate": 3.077777777777778e-05, "loss": 0.0354, "num_tokens": 554948.0, "reward": 0.1540832221508026, "reward_std": 0.3211406171321869, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08613713085651398, "rewards/belief_accuracy/std": 0.02772573195397854, "rewards/env_reward/mean": 0.25416308641433716, "rewards/env_reward/std": 0.18371644616127014, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1125, "frac_reward_zero_std": 0.0, "grad_norm": 14.237890243530273, "kl": 0.8649509251117706, "learning_rate": 3.066666666666667e-05, "loss": 0.0346, "num_tokens": 557456.0, "reward": 1.5224132537841797, "reward_std": 1.171297311782837, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0333574041724205, "rewards/belief_accuracy/std": 0.13328517973423004, "rewards/env_reward/mean": 1.060823678970337, "rewards/env_reward/std": 0.900728166103363, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 26.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.113, "frac_reward_zero_std": 0.0, "grad_norm": 2.398747205734253, "kl": 0.5030911080539227, "learning_rate": 3.055555555555556e-05, "loss": 0.0201, "num_tokens": 559973.0, "reward": -0.10044729709625244, "reward_std": 0.14265108108520508, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.11636848002672195, "rewards/env_reward/std": 0.09510072320699692, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.1135, "frac_reward_zero_std": 0.0, "grad_norm": 3.1962196826934814, "kl": 0.7515930682420731, "learning_rate": 3.044444444444445e-05, "loss": 0.0301, "num_tokens": 562501.0, "reward": 0.33509939908981323, "reward_std": 0.4751393795013428, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.406732976436615, "rewards/env_reward/std": 0.3167595863342285, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.114, "frac_reward_zero_std": 0.0, "grad_norm": 3.724390745162964, "kl": 1.3262446075677872, "learning_rate": 3.0333333333333337e-05, "loss": 0.053, "num_tokens": 565018.0, "reward": -2.383183717727661, "reward_std": 2.9639039039611816, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3471225500106812, "rewards/env_reward/std": 1.9085785150527954, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1145, "frac_reward_zero_std": 0.0, "grad_norm": 3.0508358478546143, "kl": 0.805017001926899, "learning_rate": 3.0222222222222225e-05, "loss": 0.0322, "num_tokens": 567513.0, "reward": -1.0678391456604004, "reward_std": 2.5887842178344727, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.4993927776813507, "rewards/env_reward/std": 1.6675386428833008, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.115, "frac_reward_zero_std": 0.0, "grad_norm": 2.8914108276367188, "kl": 1.2341727763414383, "learning_rate": 3.0111111111111113e-05, "loss": 0.0494, "num_tokens": 570012.0, "reward": -1.0324312448501587, "reward_std": 0.03943846374750137, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5049540996551514, "rewards/env_reward/std": 0.026292279362678528, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1155, "frac_reward_zero_std": 0.0, "grad_norm": 3.649041175842285, "kl": 0.9338645786046982, "learning_rate": 3e-05, "loss": 0.0374, "num_tokens": 572521.0, "reward": -2.383704423904419, "reward_std": 2.9854514598846436, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.347469687461853, "rewards/env_reward/std": 1.9234607219696045, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.116, "frac_reward_zero_std": 0.0, "grad_norm": 5.132113933563232, "kl": 1.6651656776666641, "learning_rate": 2.988888888888889e-05, "loss": 0.0666, "num_tokens": 575028.0, "reward": -0.17734336853027344, "reward_std": 0.43010595440864563, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.09427107125520706, "rewards/env_reward/std": 0.236448734998703, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1165, "frac_reward_zero_std": 0.0, "grad_norm": 2.701383590698242, "kl": 1.4364068657159805, "learning_rate": 2.9777777777777777e-05, "loss": 0.0575, "num_tokens": 577518.0, "reward": 0.2805197834968567, "reward_std": 0.16961893439292908, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3703465461730957, "rewards/env_reward/std": 0.11307929456233978, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.117, "frac_reward_zero_std": 0.0, "grad_norm": 4.781160354614258, "kl": 0.3255625441670418, "learning_rate": 2.9666666666666672e-05, "loss": 0.013, "num_tokens": 579950.0, "reward": 0.2083221822977066, "reward_std": 0.367115318775177, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.15008686482906342, "rewards/belief_accuracy/std": 0.12237177044153214, "rewards/env_reward/mean": 0.40572187304496765, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1175, "frac_reward_zero_std": 0.0, "grad_norm": 3.328470468521118, "kl": 1.0922381281852722, "learning_rate": 2.955555555555556e-05, "loss": 0.0437, "num_tokens": 582468.0, "reward": 0.41766709089279175, "reward_std": 0.20472979545593262, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.46177807450294495, "rewards/env_reward/std": 0.13648654520511627, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.118, "frac_reward_zero_std": 0.0, "grad_norm": 3.3024332523345947, "kl": 1.729993849992752, "learning_rate": 2.9444444444444448e-05, "loss": 0.0692, "num_tokens": 584951.0, "reward": 0.5654071569442749, "reward_std": 0.20379649102687836, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.13415177166461945, "rewards/belief_accuracy/std": 0.06830354034900665, "rewards/env_reward/mean": 0.6244083642959595, "rewards/env_reward/std": 0.07622048258781433, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1185, "frac_reward_zero_std": 0.0, "grad_norm": 3.8457822799682617, "kl": 2.1087397560477257, "learning_rate": 2.9333333333333336e-05, "loss": 0.0843, "num_tokens": 587433.0, "reward": 1.2190449237823486, "reward_std": 0.21189068257808685, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9960300326347351, "rewards/env_reward/std": 0.1412605196237564, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.119, "frac_reward_zero_std": 1.0, "grad_norm": 0.029239589348435402, "kl": 0.5205878019332886, "learning_rate": 2.9222222222222224e-05, "loss": 0.0208, "num_tokens": 589641.0, "reward": 0.6929494738578796, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4286329746246338, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1195, "frac_reward_zero_std": 0.0, "grad_norm": 5.953148365020752, "kl": 1.5010789930820465, "learning_rate": 2.9111111111111112e-05, "loss": 0.06, "num_tokens": 592116.0, "reward": -0.19302129745483398, "reward_std": 0.11821135133504868, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.05465248227119446, "rewards/env_reward/std": 0.07880757749080658, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.12, "frac_reward_zero_std": 1.0, "grad_norm": 0.028769580647349358, "kl": 0.5208476185798645, "learning_rate": 2.9e-05, "loss": 0.0208, "num_tokens": 594324.0, "reward": 0.7799785137176514, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.48665234446525574, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1205, "frac_reward_zero_std": 0.0, "grad_norm": 3.084097146987915, "kl": 1.6838389933109283, "learning_rate": 2.8888888888888888e-05, "loss": 0.0674, "num_tokens": 596812.0, "reward": -0.08287781476974487, "reward_std": 3.244748115539551, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.15724816918373108, "rewards/env_reward/std": 2.104832172393799, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 12.333333969116211, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.121, "frac_reward_zero_std": 0.0, "grad_norm": 2.273799180984497, "kl": 1.8537000715732574, "learning_rate": 2.877777777777778e-05, "loss": 0.0741, "num_tokens": 599281.0, "reward": 0.5991692543029785, "reward_std": 0.3846488893032074, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5827795267105103, "rewards/env_reward/std": 0.25643259286880493, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 25.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1215, "frac_reward_zero_std": 0.0, "grad_norm": 6.259014129638672, "kl": 1.056531861424446, "learning_rate": 2.8666666666666668e-05, "loss": 0.0423, "num_tokens": 601802.0, "reward": 1.1337945461273193, "reward_std": 0.15877185761928558, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.9683631062507629, "rewards/env_reward/std": 0.1437581330537796, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.122, "frac_reward_zero_std": 0.0, "grad_norm": 3.7003579139709473, "kl": 0.6380213499069214, "learning_rate": 2.855555555555556e-05, "loss": 0.0255, "num_tokens": 604330.0, "reward": 0.300573468208313, "reward_std": 0.29817959666252136, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3837156891822815, "rewards/env_reward/std": 0.1987864077091217, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1225, "frac_reward_zero_std": 0.0, "grad_norm": 6.614965438842773, "kl": 1.0555044412612915, "learning_rate": 2.8444444444444447e-05, "loss": 0.0422, "num_tokens": 606840.0, "reward": 0.2127276510000229, "reward_std": 0.07096138596534729, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.32515180110931396, "rewards/env_reward/std": 0.04730759561061859, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.123, "frac_reward_zero_std": 0.0, "grad_norm": 4.199899673461914, "kl": 1.0866071283817291, "learning_rate": 2.8333333333333335e-05, "loss": 0.0435, "num_tokens": 609314.0, "reward": 0.6166397929191589, "reward_std": 0.013659524731338024, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5944265127182007, "rewards/env_reward/std": 0.009106338024139404, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 17.33333396911621, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1235, "frac_reward_zero_std": 1.0, "grad_norm": 0.5806828737258911, "kl": 2.1766858994960785, "learning_rate": 2.8222222222222223e-05, "loss": 0.0871, "num_tokens": 611798.0, "reward": -0.33692148327827454, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.04128097742795944, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.124, "frac_reward_zero_std": 0.0, "grad_norm": 2.1608145236968994, "kl": 0.6925233453512192, "learning_rate": 2.811111111111111e-05, "loss": 0.0277, "num_tokens": 614279.0, "reward": -0.17361339926719666, "reward_std": 0.17766423523426056, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.06759107857942581, "rewards/env_reward/std": 0.1184428334236145, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1245, "frac_reward_zero_std": 0.0, "grad_norm": 4.571046352386475, "kl": 1.0061021000146866, "learning_rate": 2.8000000000000003e-05, "loss": 0.0402, "num_tokens": 616713.0, "reward": 0.13527683913707733, "reward_std": 0.11952438950538635, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1666666716337204, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.39018458127975464, "rewards/env_reward/std": 0.07968293130397797, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.125, "frac_reward_zero_std": 0.0, "grad_norm": 8.622580528259277, "kl": 3.023313194513321, "learning_rate": 2.788888888888889e-05, "loss": 0.1209, "num_tokens": 619160.0, "reward": 1.0675362348556519, "reward_std": 0.21132755279541016, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.8950241804122925, "rewards/env_reward/std": 0.14088504016399384, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1255, "frac_reward_zero_std": 0.0, "grad_norm": 3.349266767501831, "kl": 1.0463040620088577, "learning_rate": 2.777777777777778e-05, "loss": 0.0419, "num_tokens": 621668.0, "reward": 0.7296957969665527, "reward_std": 0.5778632760047913, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6697972416877747, "rewards/env_reward/std": 0.385242223739624, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.126, "frac_reward_zero_std": 0.0, "grad_norm": 2.983954668045044, "kl": 1.3255593031644821, "learning_rate": 2.7666666666666667e-05, "loss": 0.053, "num_tokens": 624180.0, "reward": 0.7902753949165344, "reward_std": 0.17459960281848907, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7101836204528809, "rewards/env_reward/std": 0.11639970541000366, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1265, "frac_reward_zero_std": 0.0, "grad_norm": 4.1040568351745605, "kl": 1.1374231353402138, "learning_rate": 2.7555555555555555e-05, "loss": 0.0455, "num_tokens": 626675.0, "reward": 1.2806947231292725, "reward_std": 0.3734249770641327, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.0371298789978027, "rewards/env_reward/std": 0.24895000457763672, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.127, "frac_reward_zero_std": 0.0, "grad_norm": 5.15152645111084, "kl": 1.8364887535572052, "learning_rate": 2.7444444444444443e-05, "loss": 0.0735, "num_tokens": 629171.0, "reward": -1.082712173461914, "reward_std": 2.5781917572021484, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5093082189559937, "rewards/env_reward/std": 1.660461187362671, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 23.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1275, "frac_reward_zero_std": 1.0, "grad_norm": 0.1334175318479538, "kl": 1.0996833890676498, "learning_rate": 2.733333333333333e-05, "loss": 0.044, "num_tokens": 631672.0, "reward": 1.756896734237671, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.354597806930542, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 2.445394515991211, "kl": 1.3922849670052528, "learning_rate": 2.7222222222222223e-05, "loss": 0.0557, "num_tokens": 634146.0, "reward": 0.24904996156692505, "reward_std": 1.0076991319656372, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3493666648864746, "rewards/env_reward/std": 0.6717994809150696, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1285, "frac_reward_zero_std": 0.0, "grad_norm": 5.241879463195801, "kl": 1.8242901861667633, "learning_rate": 2.7111111111111114e-05, "loss": 0.073, "num_tokens": 636602.0, "reward": -0.19289076328277588, "reward_std": 3.193817377090454, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.08390611410140991, "rewards/env_reward/std": 2.0712993144989014, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.129, "frac_reward_zero_std": 0.0, "grad_norm": 6.70985221862793, "kl": 1.6169143170118332, "learning_rate": 2.7000000000000002e-05, "loss": 0.0647, "num_tokens": 639078.0, "reward": 0.679303765296936, "reward_std": 0.7435195446014404, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6362025737762451, "rewards/env_reward/std": 0.4956797957420349, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1295, "frac_reward_zero_std": 0.0, "grad_norm": 2.5632526874542236, "kl": 1.0557297468185425, "learning_rate": 2.688888888888889e-05, "loss": 0.0422, "num_tokens": 641567.0, "reward": -0.45305585861206055, "reward_std": 3.029899835586548, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.08953723311424255, "rewards/env_reward/std": 1.962233066558838, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.13, "frac_reward_zero_std": 0.0, "grad_norm": 3.4922609329223633, "kl": 1.2477368414402008, "learning_rate": 2.677777777777778e-05, "loss": 0.0499, "num_tokens": 644066.0, "reward": -2.3447818756103516, "reward_std": 3.0328142642974854, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.09054364264011383, "rewards/belief_accuracy/std": 0.018912728875875473, "rewards/env_reward/mean": -1.3446006774902344, "rewards/env_reward/std": 1.92401123046875, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1305, "frac_reward_zero_std": 0.0, "grad_norm": 2.4958338737487793, "kl": 1.4933066070079803, "learning_rate": 2.6666666666666667e-05, "loss": 0.0597, "num_tokens": 646549.0, "reward": 1.619019865989685, "reward_std": 0.47918403148651123, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08888889104127884, "rewards/belief_accuracy/std": 0.02222222276031971, "rewards/env_reward/mean": 1.2362910509109497, "rewards/env_reward/std": 0.28824105858802795, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.131, "frac_reward_zero_std": 0.0, "grad_norm": 2.8226537704467773, "kl": 1.6106074303388596, "learning_rate": 2.6555555555555555e-05, "loss": 0.0644, "num_tokens": 649035.0, "reward": 0.7369977235794067, "reward_std": 0.2108081877231598, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6746652126312256, "rewards/env_reward/std": 0.14053881168365479, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1315, "frac_reward_zero_std": 0.0, "grad_norm": 4.192112922668457, "kl": 2.2760011553764343, "learning_rate": 2.6444444444444443e-05, "loss": 0.091, "num_tokens": 651502.0, "reward": 0.622403085231781, "reward_std": 0.43449920415878296, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5982687473297119, "rewards/env_reward/std": 0.28966614603996277, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.132, "frac_reward_zero_std": 0.0, "grad_norm": 4.731379985809326, "kl": 1.393263503909111, "learning_rate": 2.633333333333333e-05, "loss": 0.0557, "num_tokens": 653967.0, "reward": 0.10616789758205414, "reward_std": 0.26564618945121765, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2541119456291199, "rewards/env_reward/std": 0.17709745466709137, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 20.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1325, "frac_reward_zero_std": 0.0, "grad_norm": 4.605635166168213, "kl": 1.3393024802207947, "learning_rate": 2.6222222222222226e-05, "loss": 0.0536, "num_tokens": 656472.0, "reward": -1.1023221015930176, "reward_std": 2.5652267932891846, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.49321478605270386, "rewards/env_reward/std": 1.6726853847503662, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.133, "frac_reward_zero_std": 0.0, "grad_norm": 6.43408727645874, "kl": 0.9438246488571167, "learning_rate": 2.6111111111111114e-05, "loss": 0.0378, "num_tokens": 658980.0, "reward": -0.46058040857315063, "reward_std": 2.992946147918701, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.09455358982086182, "rewards/env_reward/std": 1.9369643926620483, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1335, "frac_reward_zero_std": 0.0, "grad_norm": 5.037111759185791, "kl": 1.522897057235241, "learning_rate": 2.6000000000000002e-05, "loss": 0.0609, "num_tokens": 661478.0, "reward": -0.8440333604812622, "reward_std": 2.7461934089660645, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.3501889407634735, "rewards/env_reward/std": 1.7726572751998901, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.134, "frac_reward_zero_std": 0.0, "grad_norm": 3.4638547897338867, "kl": 1.6383226662874222, "learning_rate": 2.588888888888889e-05, "loss": 0.0655, "num_tokens": 663965.0, "reward": -0.6360792517662048, "reward_std": 0.01327502727508545, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.2407195121049881, "rewards/env_reward/std": 0.008850008249282837, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 20.666667938232422, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1345, "frac_reward_zero_std": 0.0, "grad_norm": 2.5850181579589844, "kl": 1.492052584886551, "learning_rate": 2.5777777777777778e-05, "loss": 0.0597, "num_tokens": 666459.0, "reward": -0.4809204339981079, "reward_std": 2.983957052230835, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.10811367630958557, "rewards/env_reward/std": 1.9310635328292847, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.135, "frac_reward_zero_std": 1.0, "grad_norm": 2.2055325508117676, "kl": 2.1733334064483643, "learning_rate": 2.5666666666666666e-05, "loss": 0.0869, "num_tokens": 668940.0, "reward": 0.015844523906707764, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1938963532447815, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1355, "frac_reward_zero_std": 0.0, "grad_norm": 5.974431991577148, "kl": 1.7352482229471207, "learning_rate": 2.5555555555555554e-05, "loss": 0.0694, "num_tokens": 671151.0, "reward": 1.4853503704071045, "reward_std": 0.053439658135175705, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9569003582000732, "rewards/env_reward/std": 0.03562644124031067, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 7.03952169418335, "kl": 0.8288602530956268, "learning_rate": 2.5444444444444442e-05, "loss": 0.0332, "num_tokens": 673660.0, "reward": -0.24595004320144653, "reward_std": 3.136033058166504, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.04853332042694092, "rewards/env_reward/std": 2.032355546951294, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1365, "frac_reward_zero_std": 0.0, "grad_norm": 2.225179672241211, "kl": 1.4488344937562943, "learning_rate": 2.5333333333333337e-05, "loss": 0.058, "num_tokens": 676143.0, "reward": 1.1019805669784546, "reward_std": 0.872570812702179, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08888889104127884, "rewards/belief_accuracy/std": 0.02222222276031971, "rewards/env_reward/mean": 0.8915981650352478, "rewards/env_reward/std": 0.5619891881942749, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 16.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.137, "frac_reward_zero_std": 0.0, "grad_norm": 2.0150835514068604, "kl": 1.9037371575832367, "learning_rate": 2.5222222222222225e-05, "loss": 0.0761, "num_tokens": 678640.0, "reward": 0.020011983811855316, "reward_std": 0.2474244087934494, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07959593832492828, "rewards/belief_accuracy/std": 0.04080813378095627, "rewards/env_reward/mean": 0.15169985592365265, "rewards/env_reward/std": 0.07499999552965164, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1375, "frac_reward_zero_std": 1.0, "grad_norm": 0.30538198351860046, "kl": 3.629801630973816, "learning_rate": 2.5111111111111113e-05, "loss": 0.1452, "num_tokens": 681080.0, "reward": 0.16994282603263855, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.29662856459617615, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.138, "frac_reward_zero_std": 1.0, "grad_norm": 0.29680636525154114, "kl": 1.799863338470459, "learning_rate": 2.5e-05, "loss": 0.072, "num_tokens": 683569.0, "reward": 0.978661835193634, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.8357745409011841, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1385, "frac_reward_zero_std": 0.0, "grad_norm": 3.594503402709961, "kl": 1.047157421708107, "learning_rate": 2.488888888888889e-05, "loss": 0.0419, "num_tokens": 686023.0, "reward": -0.408236563205719, "reward_std": 0.6185159087181091, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.14728286862373352, "rewards/belief_accuracy/std": 0.1718742847442627, "rewards/env_reward/mean": -0.010925263166427612, "rewards/env_reward/std": 0.3019493520259857, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.139, "frac_reward_zero_std": 0.0, "grad_norm": 4.271994113922119, "kl": 1.7188260853290558, "learning_rate": 2.477777777777778e-05, "loss": 0.0688, "num_tokens": 688493.0, "reward": -0.4882531464099884, "reward_std": 2.9744977951049805, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.11300215125083923, "rewards/env_reward/std": 1.9246653318405151, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1395, "frac_reward_zero_std": 1.0, "grad_norm": 8.145206451416016, "kl": 3.5787951350212097, "learning_rate": 2.466666666666667e-05, "loss": 0.1432, "num_tokens": 690961.0, "reward": -0.254070907831192, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.013952743262052536, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.14, "frac_reward_zero_std": 1.0, "grad_norm": 0.3156006634235382, "kl": 1.6092262268066406, "learning_rate": 2.4555555555555557e-05, "loss": 0.0644, "num_tokens": 693446.0, "reward": 1.0831952095031738, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9054635763168335, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1405, "frac_reward_zero_std": 0.0, "grad_norm": 3.0812392234802246, "kl": 2.202967807650566, "learning_rate": 2.4444444444444445e-05, "loss": 0.0881, "num_tokens": 695916.0, "reward": -0.5145425200462341, "reward_std": 0.07837500423192978, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.15969499945640564, "rewards/env_reward/std": 0.052250005304813385, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.141, "frac_reward_zero_std": 0.0, "grad_norm": 2.6371960639953613, "kl": 0.2116354387253523, "learning_rate": 2.4333333333333336e-05, "loss": 0.0085, "num_tokens": 698421.0, "reward": -0.10934996604919434, "reward_std": 0.24821718037128448, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.11043336987495422, "rewards/env_reward/std": 0.16547811031341553, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1415, "frac_reward_zero_std": 0.0, "grad_norm": 4.447051525115967, "kl": 1.6157226860523224, "learning_rate": 2.4222222222222224e-05, "loss": 0.0646, "num_tokens": 700884.0, "reward": 0.7572240829467773, "reward_std": 0.34561485052108765, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6881493330001831, "rewards/env_reward/std": 0.2304098904132843, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 14.333333969116211, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.142, "frac_reward_zero_std": 0.0, "grad_norm": 3.0768420696258545, "kl": 1.9094894081354141, "learning_rate": 2.4111111111111113e-05, "loss": 0.0764, "num_tokens": 703359.0, "reward": -0.14820998907089233, "reward_std": 3.201193332672119, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.11369338631629944, "rewards/env_reward/std": 2.0757956504821777, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 25.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1425, "frac_reward_zero_std": 0.0, "grad_norm": 3.422678232192993, "kl": 1.0840960815548897, "learning_rate": 2.4e-05, "loss": 0.0434, "num_tokens": 705880.0, "reward": -1.8789894580841064, "reward_std": 3.5502729415893555, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.0109930038452148, "rewards/env_reward/std": 2.2995729446411133, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 15.333333969116211, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.143, "frac_reward_zero_std": 1.0, "grad_norm": 0.17799264192581177, "kl": 1.9597035348415375, "learning_rate": 2.3888888888888892e-05, "loss": 0.0784, "num_tokens": 708358.0, "reward": 0.1835106611251831, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3056737780570984, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1435, "frac_reward_zero_std": 1.0, "grad_norm": 0.08594390004873276, "kl": 1.2184199467301369, "learning_rate": 2.377777777777778e-05, "loss": 0.0487, "num_tokens": 710838.0, "reward": 0.28326112031936646, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.37217411398887634, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 4.5200910568237305, "kl": 1.4814732670783997, "learning_rate": 2.3666666666666668e-05, "loss": 0.0593, "num_tokens": 713286.0, "reward": -1.652359962463379, "reward_std": 2.199069023132324, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.889073371887207, "rewards/env_reward/std": 1.4077305793762207, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1445, "frac_reward_zero_std": 0.0, "grad_norm": 3.1203649044036865, "kl": 1.7035606056451797, "learning_rate": 2.3555555555555556e-05, "loss": 0.0681, "num_tokens": 715764.0, "reward": -0.1590951681137085, "reward_std": 0.13472223281860352, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07962962985038757, "rewards/belief_accuracy/std": 0.04074074327945709, "rewards/env_reward/mean": 0.03236249089241028, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 21.666667938232422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.145, "frac_reward_zero_std": 0.0, "grad_norm": 4.187851905822754, "kl": 1.007147029042244, "learning_rate": 2.3444444444444448e-05, "loss": 0.0403, "num_tokens": 718261.0, "reward": -0.09939317405223846, "reward_std": 0.23847907781600952, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1170712262392044, "rewards/env_reward/std": 0.15898606181144714, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1455, "frac_reward_zero_std": 0.0, "grad_norm": 3.5976037979125977, "kl": 1.7104326635599136, "learning_rate": 2.3333333333333336e-05, "loss": 0.0684, "num_tokens": 720700.0, "reward": 1.4686640501022339, "reward_std": 0.1929871290922165, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.162442684173584, "rewards/env_reward/std": 0.12865811586380005, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 12.333333969116211, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.146, "frac_reward_zero_std": 1.0, "grad_norm": 0.35819676518440247, "kl": 1.510893777012825, "learning_rate": 2.3222222222222224e-05, "loss": 0.0604, "num_tokens": 723169.0, "reward": -0.17388460040092468, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.06741027534008026, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1465, "frac_reward_zero_std": 0.0, "grad_norm": 4.2439751625061035, "kl": 0.9592940956354141, "learning_rate": 2.3111111111111112e-05, "loss": 0.0384, "num_tokens": 725667.0, "reward": 1.141730546951294, "reward_std": 0.49770018458366394, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9444870352745056, "rewards/env_reward/std": 0.3318001627922058, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.147, "frac_reward_zero_std": 1.0, "grad_norm": 0.11825627088546753, "kl": 1.7975642383098602, "learning_rate": 2.3000000000000003e-05, "loss": 0.0719, "num_tokens": 728137.0, "reward": 0.03742995858192444, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.20828664302825928, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1475, "frac_reward_zero_std": 0.0, "grad_norm": 2.948641777038574, "kl": 1.2299351058900356, "learning_rate": 2.288888888888889e-05, "loss": 0.0492, "num_tokens": 730600.0, "reward": 0.7943815588951111, "reward_std": 0.04808274284005165, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7129210233688354, "rewards/env_reward/std": 0.03205517679452896, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.148, "frac_reward_zero_std": 0.0, "grad_norm": 1.8954527378082275, "kl": 1.4944463968276978, "learning_rate": 2.277777777777778e-05, "loss": 0.0598, "num_tokens": 733057.0, "reward": -1.1060881614685059, "reward_std": 2.5627288818359375, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5248920917510986, "rewards/env_reward/std": 1.6501553058624268, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 10.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1485, "frac_reward_zero_std": 0.0, "grad_norm": 2.3740248680114746, "kl": 1.4237545728683472, "learning_rate": 2.2666666666666668e-05, "loss": 0.057, "num_tokens": 735520.0, "reward": 1.2797008752822876, "reward_std": 0.22939470410346985, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.0364673137664795, "rewards/env_reward/std": 0.15292981266975403, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 20.666667938232422, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.149, "frac_reward_zero_std": 0.0, "grad_norm": 3.193483829498291, "kl": 1.3601939976215363, "learning_rate": 2.255555555555556e-05, "loss": 0.0544, "num_tokens": 738014.0, "reward": 0.5479322671890259, "reward_std": 0.025000015273690224, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.577788233757019, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1495, "frac_reward_zero_std": 0.0, "grad_norm": 3.508397102355957, "kl": 2.330005407333374, "learning_rate": 2.2444444444444447e-05, "loss": 0.0932, "num_tokens": 740476.0, "reward": 0.6145496368408203, "reward_std": 0.276214063167572, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5930330753326416, "rewards/env_reward/std": 0.18414270877838135, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.23656092584133148, "kl": 1.6847761273384094, "learning_rate": 2.2333333333333335e-05, "loss": 0.0674, "num_tokens": 742965.0, "reward": -0.4485160708427429, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.11567738652229309, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1505, "frac_reward_zero_std": 0.0, "grad_norm": 2.4569952487945557, "kl": 2.1653302907943726, "learning_rate": 2.2222222222222223e-05, "loss": 0.0866, "num_tokens": 745399.0, "reward": 0.7054736614227295, "reward_std": 0.29305312037467957, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.29629629850387573, "rewards/belief_accuracy/std": 0.03703702986240387, "rewards/env_reward/mean": 1.0295751094818115, "rewards/env_reward/std": 0.2694428265094757, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.151, "frac_reward_zero_std": 1.0, "grad_norm": 2.0652999877929688, "kl": 2.023993544280529, "learning_rate": 2.211111111111111e-05, "loss": 0.081, "num_tokens": 747881.0, "reward": 0.03393635153770447, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2059575766324997, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1515, "frac_reward_zero_std": 1.0, "grad_norm": 0.17844082415103912, "kl": 1.3347703516483307, "learning_rate": 2.2000000000000003e-05, "loss": 0.0534, "num_tokens": 750363.0, "reward": -0.07117094099521637, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1358860433101654, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 4.173818111419678, "kl": 2.3482457399368286, "learning_rate": 2.188888888888889e-05, "loss": 0.0939, "num_tokens": 752819.0, "reward": 0.19658201932907104, "reward_std": 0.014906898140907288, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3143880367279053, "rewards/env_reward/std": 0.009937942028045654, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1525, "frac_reward_zero_std": 0.0, "grad_norm": 3.2245287895202637, "kl": 0.9445049874484539, "learning_rate": 2.177777777777778e-05, "loss": 0.0378, "num_tokens": 755308.0, "reward": -0.8592178821563721, "reward_std": 2.7382078170776367, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.13583332300186157, "rewards/belief_accuracy/std": 0.059961408376693726, "rewards/env_reward/mean": -0.3011452257633209, "rewards/env_reward/std": 1.803206205368042, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.153, "frac_reward_zero_std": 1.0, "grad_norm": 0.128830686211586, "kl": 1.6012963205575943, "learning_rate": 2.1666666666666667e-05, "loss": 0.0641, "num_tokens": 757778.0, "reward": 2.284616231918335, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.7064106464385986, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 18.666667938232422, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1535, "frac_reward_zero_std": 0.0, "grad_norm": 2.645291805267334, "kl": 1.772395834326744, "learning_rate": 2.1555555555555555e-05, "loss": 0.0709, "num_tokens": 760266.0, "reward": 0.8521826267242432, "reward_std": 0.31403204798698425, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.04974466934800148, "rewards/belief_accuracy/std": 0.10051066428422928, "rewards/env_reward/mean": 0.6467777490615845, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.154, "frac_reward_zero_std": 1.0, "grad_norm": 0.47690078616142273, "kl": 2.9113578498363495, "learning_rate": 2.1444444444444443e-05, "loss": 0.1165, "num_tokens": 762719.0, "reward": 1.4255595207214355, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.1337064504623413, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1545, "frac_reward_zero_std": 0.0, "grad_norm": 4.587830066680908, "kl": 2.2061780989170074, "learning_rate": 2.1333333333333335e-05, "loss": 0.0882, "num_tokens": 765179.0, "reward": -0.2726000249385834, "reward_std": 0.008999993093311787, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.001600000075995922, "rewards/env_reward/std": 0.005999999586492777, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.155, "frac_reward_zero_std": 0.0, "grad_norm": 3.092574119567871, "kl": 1.9001564681529999, "learning_rate": 2.1222222222222223e-05, "loss": 0.076, "num_tokens": 767642.0, "reward": 1.1732711791992188, "reward_std": 0.32429030537605286, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.04803495109081268, "rewards/belief_accuracy/std": 0.10393010079860687, "rewards/env_reward/mean": 0.8574174642562866, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1555, "frac_reward_zero_std": 1.0, "grad_norm": 0.12916934490203857, "kl": 1.6546337455511093, "learning_rate": 2.111111111111111e-05, "loss": 0.0662, "num_tokens": 770116.0, "reward": 1.206794023513794, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9878627061843872, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.156, "frac_reward_zero_std": 0.0, "grad_norm": 1.9729405641555786, "kl": 1.226298600435257, "learning_rate": 2.1e-05, "loss": 0.0491, "num_tokens": 772603.0, "reward": -0.8138376474380493, "reward_std": 2.757441520690918, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.3300584852695465, "rewards/env_reward/std": 1.7799609899520874, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1565, "frac_reward_zero_std": 0.0, "grad_norm": 3.1958553791046143, "kl": 1.9549526572227478, "learning_rate": 2.088888888888889e-05, "loss": 0.0782, "num_tokens": 775051.0, "reward": -0.8436893820762634, "reward_std": 0.03750000521540642, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10833333432674408, "rewards/belief_accuracy/std": 0.01666666939854622, "rewards/env_reward/mean": -0.36662623286247253, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.157, "frac_reward_zero_std": 0.0, "grad_norm": 4.083063125610352, "kl": 1.7084714621305466, "learning_rate": 2.077777777777778e-05, "loss": 0.0683, "num_tokens": 777519.0, "reward": -0.9321072697639465, "reward_std": 2.689814329147339, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0886182188987732, "rewards/belief_accuracy/std": 0.022763576358556747, "rewards/env_reward/mean": -0.4358351230621338, "rewards/env_reward/std": 1.7134547233581543, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1575, "frac_reward_zero_std": 0.0, "grad_norm": 2.9729490280151367, "kl": 1.67718306183815, "learning_rate": 2.0666666666666666e-05, "loss": 0.0671, "num_tokens": 780008.0, "reward": 0.4863058924674988, "reward_std": 0.024692803621292114, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.09369880706071854, "rewards/belief_accuracy/std": 0.012602388858795166, "rewards/env_reward/mean": 0.4907682240009308, "rewards/env_reward/std": 0.050000011920928955, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 9.333333969116211, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.158, "frac_reward_zero_std": 1.0, "grad_norm": 0.2093023657798767, "kl": 1.742059737443924, "learning_rate": 2.0555555555555555e-05, "loss": 0.0697, "num_tokens": 782468.0, "reward": 0.46483826637268066, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4932255446910858, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1585, "frac_reward_zero_std": 0.0, "grad_norm": 4.33806037902832, "kl": 1.9989095479249954, "learning_rate": 2.0444444444444446e-05, "loss": 0.08, "num_tokens": 784932.0, "reward": 1.7721874713897705, "reward_std": 0.821945071220398, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08935601264238358, "rewards/belief_accuracy/std": 0.021287977695465088, "rewards/env_reward/mean": 1.3393369913101196, "rewards/env_reward/std": 0.5330812931060791, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.159, "frac_reward_zero_std": 0.0, "grad_norm": 4.552093505859375, "kl": 1.7921818047761917, "learning_rate": 2.0333333333333334e-05, "loss": 0.0717, "num_tokens": 787424.0, "reward": 0.2771155834197998, "reward_std": 0.052816081792116165, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3680770993232727, "rewards/env_reward/std": 0.03521072119474411, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1595, "frac_reward_zero_std": 0.0, "grad_norm": 5.766949653625488, "kl": 1.8316420912742615, "learning_rate": 2.0222222222222222e-05, "loss": 0.0733, "num_tokens": 789905.0, "reward": 0.7448418140411377, "reward_std": 0.2648809254169464, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.125, "rewards/belief_accuracy/std": 0.05000000074505806, "rewards/env_reward/mean": 0.725727915763855, "rewards/env_reward/std": 0.21821647882461548, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.16, "frac_reward_zero_std": 0.0, "grad_norm": 3.683352470397949, "kl": 1.9505991637706757, "learning_rate": 2.011111111111111e-05, "loss": 0.078, "num_tokens": 792357.0, "reward": 2.3521366119384766, "reward_std": 0.05138897895812988, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0981481522321701, "rewards/belief_accuracy/std": 0.0037037059664726257, "rewards/env_reward/mean": 1.7435539960861206, "rewards/env_reward/std": 0.050000011920928955, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1605, "frac_reward_zero_std": 0.0, "grad_norm": 3.7608256340026855, "kl": 1.294388547539711, "learning_rate": 2e-05, "loss": 0.0518, "num_tokens": 794851.0, "reward": -1.1710084676742554, "reward_std": 2.5193276405334473, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5681723356246948, "rewards/env_reward/std": 1.6212184429168701, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.161, "frac_reward_zero_std": 0.0, "grad_norm": 2.618898391723633, "kl": 1.0062730349600315, "learning_rate": 1.988888888888889e-05, "loss": 0.0403, "num_tokens": 797334.0, "reward": -1.3532192707061768, "reward_std": 2.3978536128997803, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6896461844444275, "rewards/env_reward/std": 1.5402358770370483, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1615, "frac_reward_zero_std": 0.0, "grad_norm": 2.3661885261535645, "kl": 1.7326427102088928, "learning_rate": 1.9777777777777778e-05, "loss": 0.0693, "num_tokens": 799804.0, "reward": -0.9932671785354614, "reward_std": 2.645886182785034, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.09086865186691284, "rewards/belief_accuracy/std": 0.018262699246406555, "rewards/env_reward/mean": -0.47210749983787537, "rewards/env_reward/std": 1.6948373317718506, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.162, "frac_reward_zero_std": 1.0, "grad_norm": 0.10320170223712921, "kl": 1.0313833132386208, "learning_rate": 1.9666666666666666e-05, "loss": 0.0413, "num_tokens": 802311.0, "reward": 0.14523997902870178, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.28015998005867004, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1625, "frac_reward_zero_std": 0.0, "grad_norm": 2.900913953781128, "kl": 1.2387384474277496, "learning_rate": 1.9555555555555557e-05, "loss": 0.0495, "num_tokens": 804799.0, "reward": -1.4717153310775757, "reward_std": 2.3200571537017822, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08888889104127884, "rewards/belief_accuracy/std": 0.02222222276031971, "rewards/env_reward/mean": -0.7950325012207031, "rewards/env_reward/std": 1.4699784517288208, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.163, "frac_reward_zero_std": 0.0, "grad_norm": 1.6629624366760254, "kl": 1.2183350324630737, "learning_rate": 1.9444444444444445e-05, "loss": 0.0487, "num_tokens": 807276.0, "reward": 1.4414076805114746, "reward_std": 0.7849681973457336, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.03274526074528694, "rewards/belief_accuracy/std": 0.046666666865348816, "rewards/env_reward/mean": 0.8621145486831665, "rewards/env_reward/std": 0.5469719767570496, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1635, "frac_reward_zero_std": 0.0, "grad_norm": 3.7185583114624023, "kl": 0.9632565826177597, "learning_rate": 1.9333333333333333e-05, "loss": 0.0385, "num_tokens": 809790.0, "reward": 0.584887683391571, "reward_std": 0.04735124111175537, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11294145882129669, "rewards/belief_accuracy/std": 0.025882910937070847, "rewards/env_reward/mean": 0.5949747562408447, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.164, "frac_reward_zero_std": 1.0, "grad_norm": 0.11391498893499374, "kl": 1.3412380516529083, "learning_rate": 1.922222222222222e-05, "loss": 0.0536, "num_tokens": 812270.0, "reward": 0.08814990520477295, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.24209994077682495, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1645, "frac_reward_zero_std": 0.0, "grad_norm": 1.71048104763031, "kl": 1.0797484368085861, "learning_rate": 1.9111111111111113e-05, "loss": 0.0432, "num_tokens": 814736.0, "reward": -0.007705964148044586, "reward_std": 0.3800565302371979, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1781960427761078, "rewards/env_reward/std": 0.2533710300922394, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.165, "frac_reward_zero_std": 1.0, "grad_norm": 0.2313052862882614, "kl": 2.1413558423519135, "learning_rate": 1.9e-05, "loss": 0.0857, "num_tokens": 817183.0, "reward": 1.2785534858703613, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.0357023477554321, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1655, "frac_reward_zero_std": 1.0, "grad_norm": 1.0023722648620605, "kl": 1.8249645978212357, "learning_rate": 1.888888888888889e-05, "loss": 0.073, "num_tokens": 819662.0, "reward": 0.22990059852600098, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3366004228591919, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.166, "frac_reward_zero_std": 0.0, "grad_norm": 2.8686375617980957, "kl": 1.2005413547158241, "learning_rate": 1.8777777777777777e-05, "loss": 0.048, "num_tokens": 822139.0, "reward": 0.4151918888092041, "reward_std": 0.25368455052375793, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.19324073195457458, "rewards/belief_accuracy/std": 0.08456152677536011, "rewards/env_reward/mean": 0.6299427151679993, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1665, "frac_reward_zero_std": 0.0, "grad_norm": 3.5046756267547607, "kl": 1.418169043958187, "learning_rate": 1.866666666666667e-05, "loss": 0.0567, "num_tokens": 824628.0, "reward": -0.14963069558143616, "reward_std": 0.08749999105930328, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.11274620145559311, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.167, "frac_reward_zero_std": 1.0, "grad_norm": 0.3764250874519348, "kl": 1.4826988205313683, "learning_rate": 1.8555555555555557e-05, "loss": 0.0593, "num_tokens": 827086.0, "reward": 0.7749629616737366, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6999753713607788, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1675, "frac_reward_zero_std": 0.0, "grad_norm": 3.6749558448791504, "kl": 0.48872488737106323, "learning_rate": 1.8444444444444445e-05, "loss": 0.0195, "num_tokens": 829294.0, "reward": 0.8840881586074829, "reward_std": 0.5165320634841919, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5560587644577026, "rewards/env_reward/std": 0.3443547189235687, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 2.3814547061920166, "kl": 1.7959783673286438, "learning_rate": 1.8333333333333333e-05, "loss": 0.0718, "num_tokens": 831767.0, "reward": 1.6337858438491821, "reward_std": 0.598229169845581, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.2725238800048828, "rewards/env_reward/std": 0.39881935715675354, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 20.33333396911621, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1685, "frac_reward_zero_std": 0.0, "grad_norm": 2.819786548614502, "kl": 1.5642708837985992, "learning_rate": 1.8222222222222224e-05, "loss": 0.0626, "num_tokens": 834260.0, "reward": 0.8285795450210571, "reward_std": 0.7248117923736572, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7357197403907776, "rewards/env_reward/std": 0.48320794105529785, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.169, "frac_reward_zero_std": 0.0, "grad_norm": 3.874178647994995, "kl": 1.9785820245742798, "learning_rate": 1.8111111111111112e-05, "loss": 0.0791, "num_tokens": 836726.0, "reward": -1.273527979850769, "reward_std": 2.451591968536377, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.09240995347499847, "rewards/belief_accuracy/std": 0.015180099755525589, "rewards/env_reward/mean": -0.6558654308319092, "rewards/env_reward/std": 1.5627564191818237, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1695, "frac_reward_zero_std": 1.0, "grad_norm": 1.4243091344833374, "kl": 2.757372349500656, "learning_rate": 1.8e-05, "loss": 0.1103, "num_tokens": 839160.0, "reward": -0.0059850215911865234, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1793433278799057, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.17, "frac_reward_zero_std": 1.0, "grad_norm": 0.5190337896347046, "kl": 1.0218081027269363, "learning_rate": 1.788888888888889e-05, "loss": 0.0409, "num_tokens": 841680.0, "reward": 0.3096563220024109, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3897709250450134, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 20.33333396911621, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1705, "frac_reward_zero_std": 0.0, "grad_norm": 3.4040679931640625, "kl": 1.460461974143982, "learning_rate": 1.777777777777778e-05, "loss": 0.0584, "num_tokens": 844173.0, "reward": 0.343553364276886, "reward_std": 0.12038552761077881, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11296296119689941, "rewards/belief_accuracy/std": 0.07037036865949631, "rewards/env_reward/mean": 0.4257948398590088, "rewards/env_reward/std": 0.08660253137350082, "rewards/format_valid/mean": 0.875, "rewards/format_valid/std": 0.25, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.171, "frac_reward_zero_std": 1.0, "grad_norm": 0.3424733579158783, "kl": 2.612625613808632, "learning_rate": 1.7666666666666668e-05, "loss": 0.1045, "num_tokens": 846605.0, "reward": 0.7122496962547302, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6581664681434631, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1715, "frac_reward_zero_std": 0.0, "grad_norm": 4.17530632019043, "kl": 1.9749469459056854, "learning_rate": 1.7555555555555556e-05, "loss": 0.079, "num_tokens": 849092.0, "reward": 0.6472027897834778, "reward_std": 0.02499997615814209, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.6439685821533203, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.172, "frac_reward_zero_std": 0.0, "grad_norm": 1.9005358219146729, "kl": 1.6942770928144455, "learning_rate": 1.7444444444444448e-05, "loss": 0.0678, "num_tokens": 851583.0, "reward": -0.1503353714942932, "reward_std": 3.1997761726379395, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.11227646470069885, "rewards/env_reward/std": 2.0748510360717773, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1725, "frac_reward_zero_std": 0.0, "grad_norm": 3.6135246753692627, "kl": 2.6989140063524246, "learning_rate": 1.7333333333333336e-05, "loss": 0.108, "num_tokens": 854040.0, "reward": 0.3334026038646698, "reward_std": 0.41579148173332214, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08210506290197372, "rewards/belief_accuracy/std": 0.08616961538791656, "rewards/env_reward/mean": 0.3614785075187683, "rewards/env_reward/std": 0.10307764261960983, "rewards/format_valid/mean": 0.75, "rewards/format_valid/std": 0.28867512941360474, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.173, "frac_reward_zero_std": 1.0, "grad_norm": 0.07000944018363953, "kl": 1.5389113873243332, "learning_rate": 1.7222222222222224e-05, "loss": 0.0616, "num_tokens": 856490.0, "reward": -0.5777875781059265, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.20185838639736176, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 19.666667938232422, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1735, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494380682706833, "kl": 1.0276093482971191, "learning_rate": 1.7111111111111112e-05, "loss": 0.0411, "num_tokens": 858981.0, "reward": -0.03337675333023071, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1610821634531021, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.174, "frac_reward_zero_std": 1.0, "grad_norm": 3.6167244911193848, "kl": 2.8303582668304443, "learning_rate": 1.7000000000000003e-05, "loss": 0.1132, "num_tokens": 861433.0, "reward": -0.3828308582305908, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.07188723236322403, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1745, "frac_reward_zero_std": 0.0, "grad_norm": 3.7456555366516113, "kl": 1.0419511049985886, "learning_rate": 1.688888888888889e-05, "loss": 0.0417, "num_tokens": 863906.0, "reward": -1.5166230201721191, "reward_std": 2.289083957672119, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7985820770263672, "rewards/env_reward/std": 1.4677271842956543, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.175, "frac_reward_zero_std": 0.0, "grad_norm": 4.4072089195251465, "kl": 1.2833919078111649, "learning_rate": 1.677777777777778e-05, "loss": 0.0513, "num_tokens": 866366.0, "reward": -0.9303372502326965, "reward_std": 2.7289562225341797, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.40772485733032227, "rewards/env_reward/std": 1.762056589126587, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1755, "frac_reward_zero_std": 0.0, "grad_norm": 5.789770603179932, "kl": 2.8316257670521736, "learning_rate": 1.6666666666666667e-05, "loss": 0.1133, "num_tokens": 868810.0, "reward": -1.0804004669189453, "reward_std": 2.5841238498687744, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5077669620513916, "rewards/env_reward/std": 1.6645185947418213, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.176, "frac_reward_zero_std": 0.0, "grad_norm": 4.470717430114746, "kl": 1.4938061088323593, "learning_rate": 1.655555555555556e-05, "loss": 0.0598, "num_tokens": 871261.0, "reward": 0.675422191619873, "reward_std": 0.3987497091293335, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6336148381233215, "rewards/env_reward/std": 0.26583316922187805, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1765, "frac_reward_zero_std": 1.0, "grad_norm": 0.12403933703899384, "kl": 1.361063152551651, "learning_rate": 1.6444444444444447e-05, "loss": 0.0544, "num_tokens": 873721.0, "reward": 0.8822908997535706, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7715272903442383, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.177, "frac_reward_zero_std": 0.0, "grad_norm": 1.7063394784927368, "kl": 0.3987312912940979, "learning_rate": 1.6333333333333335e-05, "loss": 0.0159, "num_tokens": 876153.0, "reward": 0.5504003167152405, "reward_std": 0.21252882480621338, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.09899382293224335, "rewards/belief_accuracy/std": 0.03703703358769417, "rewards/env_reward/mean": 0.5315878391265869, "rewards/env_reward/std": 0.06761179864406586, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 10.666666984558105, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1775, "frac_reward_zero_std": 0.0, "grad_norm": 1.8553385734558105, "kl": 1.883504644036293, "learning_rate": 1.6222222222222223e-05, "loss": 0.0753, "num_tokens": 878617.0, "reward": 1.5724425315856934, "reward_std": 0.34154796600341797, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.06390867382287979, "rewards/belief_accuracy/std": 0.07218265533447266, "rewards/env_reward/mean": 1.1552791595458984, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 13.333333969116211, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.178, "frac_reward_zero_std": 1.0, "grad_norm": 12.448164939880371, "kl": 2.9188559651374817, "learning_rate": 1.6111111111111115e-05, "loss": 0.1168, "num_tokens": 881089.0, "reward": 0.1419695019721985, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.27797967195510864, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1785, "frac_reward_zero_std": 0.0, "grad_norm": 4.100508213043213, "kl": 1.9373711496591568, "learning_rate": 1.6000000000000003e-05, "loss": 0.0775, "num_tokens": 883555.0, "reward": 0.9330868721008301, "reward_std": 0.1425846964120865, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07523864507675171, "rewards/belief_accuracy/std": 0.04427932947874069, "rewards/env_reward/mean": 0.747535228729248, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.75, "rewards/format_valid/std": 0.28867512941360474, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 18.666667938232422, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.179, "frac_reward_zero_std": 0.0, "grad_norm": 2.447638750076294, "kl": 1.3387613743543625, "learning_rate": 1.588888888888889e-05, "loss": 0.0536, "num_tokens": 886043.0, "reward": -0.9939883947372437, "reward_std": 2.637341022491455, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.450158953666687, "rewards/env_reward/std": 1.6998939514160156, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1795, "frac_reward_zero_std": 0.0, "grad_norm": 3.1080024242401123, "kl": 1.570095743983984, "learning_rate": 1.577777777777778e-05, "loss": 0.0628, "num_tokens": 888503.0, "reward": 0.31804150342941284, "reward_std": 0.24924513697624207, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07387004047632217, "rewards/belief_accuracy/std": 0.05280756205320358, "rewards/env_reward/mean": 0.3264344036579132, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.18, "frac_reward_zero_std": 1.0, "grad_norm": 0.21039718389511108, "kl": 1.6601714193820953, "learning_rate": 1.5666666666666667e-05, "loss": 0.0664, "num_tokens": 890936.0, "reward": 1.1850183010101318, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9733456373214722, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1805, "frac_reward_zero_std": 0.0, "grad_norm": 4.855799674987793, "kl": 2.307783365249634, "learning_rate": 1.5555555555555555e-05, "loss": 0.0923, "num_tokens": 893415.0, "reward": -0.09686481952667236, "reward_std": 3.240438222885132, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10740740597248077, "rewards/belief_accuracy/std": 0.014814812690019608, "rewards/env_reward/mean": 0.1585715115070343, "rewards/env_reward/std": 2.109255075454712, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.181, "frac_reward_zero_std": 0.0, "grad_norm": 2.866522789001465, "kl": 1.5097930580377579, "learning_rate": 1.5444444444444446e-05, "loss": 0.0604, "num_tokens": 895624.0, "reward": 0.7403470277786255, "reward_std": 0.04096466302871704, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.46023130416870117, "rewards/env_reward/std": 0.027309775352478027, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1815, "frac_reward_zero_std": 0.0, "grad_norm": 3.6737067699432373, "kl": 2.508372038602829, "learning_rate": 1.5333333333333334e-05, "loss": 0.1003, "num_tokens": 898087.0, "reward": 0.41143205761909485, "reward_std": 0.1285679042339325, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0962962955236435, "rewards/belief_accuracy/std": 0.06063224375247955, "rewards/env_reward/mean": 0.44188064336776733, "rewards/env_reward/std": 0.07500001788139343, "rewards/format_valid/mean": 0.75, "rewards/format_valid/std": 0.28867512941360474, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.182, "frac_reward_zero_std": 0.0, "grad_norm": 3.876965284347534, "kl": 1.120678260922432, "learning_rate": 1.5222222222222224e-05, "loss": 0.0448, "num_tokens": 900594.0, "reward": -0.9049590826034546, "reward_std": 2.696809768676758, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10109934210777283, "rewards/belief_accuracy/std": 0.002198692411184311, "rewards/env_reward/mean": -0.3927740752696991, "rewards/env_reward/std": 1.7381988763809204, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1825, "frac_reward_zero_std": 1.0, "grad_norm": 0.1522875726222992, "kl": 2.180483788251877, "learning_rate": 1.5111111111111112e-05, "loss": 0.0872, "num_tokens": 903041.0, "reward": -0.1322389543056488, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.09517402946949005, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.183, "frac_reward_zero_std": 0.0, "grad_norm": 1.5857621431350708, "kl": 1.8570825457572937, "learning_rate": 1.5e-05, "loss": 0.0743, "num_tokens": 905267.0, "reward": 0.7447291612625122, "reward_std": 0.43247494101524353, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.46315279603004456, "rewards/env_reward/std": 0.28831663727760315, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 20.666667938232422, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1835, "frac_reward_zero_std": 0.0, "grad_norm": 3.6089088916778564, "kl": 1.4628183841705322, "learning_rate": 1.4888888888888888e-05, "loss": 0.0585, "num_tokens": 907761.0, "reward": -0.8686517477035522, "reward_std": 2.723708152770996, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.36660122871398926, "rewards/env_reward/std": 1.757534384727478, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 13.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.184, "frac_reward_zero_std": 0.0, "grad_norm": 3.5371828079223633, "kl": 1.56082084774971, "learning_rate": 1.477777777777778e-05, "loss": 0.0624, "num_tokens": 910251.0, "reward": -0.38402676582336426, "reward_std": 0.11692270636558533, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.07268451154232025, "rewards/env_reward/std": 0.07794848084449768, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1845, "frac_reward_zero_std": 0.0, "grad_norm": 3.959965944290161, "kl": 2.1547632068395615, "learning_rate": 1.4666666666666668e-05, "loss": 0.0862, "num_tokens": 912740.0, "reward": 2.330038070678711, "reward_std": 1.4076868295669556, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.736691951751709, "rewards/env_reward/std": 0.9384578466415405, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.185, "frac_reward_zero_std": 0.0, "grad_norm": 4.676491737365723, "kl": 1.9522491097450256, "learning_rate": 1.4555555555555556e-05, "loss": 0.0781, "num_tokens": 915217.0, "reward": 0.3270930051803589, "reward_std": 0.4210644066333771, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1784050613641739, "rewards/belief_accuracy/std": 0.11000154912471771, "rewards/env_reward/mean": 0.5498721599578857, "rewards/env_reward/std": 0.2555084824562073, "rewards/format_valid/mean": 0.75, "rewards/format_valid/std": 0.28867512941360474, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1855, "frac_reward_zero_std": 0.0, "grad_norm": 3.2627854347229004, "kl": 1.2735976241528988, "learning_rate": 1.4444444444444444e-05, "loss": 0.0509, "num_tokens": 917679.0, "reward": -1.3109471797943115, "reward_std": 2.4330153465270996, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6614648103713989, "rewards/env_reward/std": 1.5638506412506104, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.186, "frac_reward_zero_std": 1.0, "grad_norm": 0.16428321599960327, "kl": 1.3368073627352715, "learning_rate": 1.4333333333333334e-05, "loss": 0.0535, "num_tokens": 920166.0, "reward": 1.3904471397399902, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.1102981567382812, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1865, "frac_reward_zero_std": 0.0, "grad_norm": 2.8244247436523438, "kl": 1.7505681961774826, "learning_rate": 1.4222222222222224e-05, "loss": 0.07, "num_tokens": 922599.0, "reward": -0.15878620743751526, "reward_std": 0.2896662950515747, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.21296297013759613, "rewards/belief_accuracy/std": 0.055555559694767, "rewards/env_reward/mean": 0.2867351174354553, "rewards/env_reward/std": 0.0973709374666214, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.187, "frac_reward_zero_std": 0.0, "grad_norm": 1.4690308570861816, "kl": 1.8018342107534409, "learning_rate": 1.4111111111111112e-05, "loss": 0.0721, "num_tokens": 925052.0, "reward": -0.061834536492824554, "reward_std": 0.3926420509815216, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.14211031794548035, "rewards/env_reward/std": 0.2617613673210144, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1875, "frac_reward_zero_std": 1.0, "grad_norm": 0.06469111144542694, "kl": 1.3878820985555649, "learning_rate": 1.4000000000000001e-05, "loss": 0.0555, "num_tokens": 927529.0, "reward": 1.96260666847229, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.4917376041412354, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.188, "frac_reward_zero_std": 1.0, "grad_norm": 0.1889788955450058, "kl": 2.4346917867660522, "learning_rate": 1.388888888888889e-05, "loss": 0.0974, "num_tokens": 929976.0, "reward": -0.43010014295578003, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.1034000813961029, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1885, "frac_reward_zero_std": 0.0, "grad_norm": 3.5201666355133057, "kl": 1.3663449361920357, "learning_rate": 1.3777777777777778e-05, "loss": 0.0547, "num_tokens": 932452.0, "reward": 0.06843796372413635, "reward_std": 0.21172499656677246, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2289586365222931, "rewards/env_reward/std": 0.14114999771118164, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.189, "frac_reward_zero_std": 1.0, "grad_norm": 0.14993655681610107, "kl": 0.6987491399049759, "learning_rate": 1.3666666666666666e-05, "loss": 0.0279, "num_tokens": 934974.0, "reward": 1.5713868141174316, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.2309246063232422, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1895, "frac_reward_zero_std": 0.0, "grad_norm": 2.221292018890381, "kl": 1.250510759651661, "learning_rate": 1.3555555555555557e-05, "loss": 0.05, "num_tokens": 937456.0, "reward": -1.592812180519104, "reward_std": 2.260319471359253, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.8493747711181641, "rewards/env_reward/std": 1.4491422176361084, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 12.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.19, "frac_reward_zero_std": 0.0, "grad_norm": 2.452324628829956, "kl": 1.871617242693901, "learning_rate": 1.3444444444444445e-05, "loss": 0.0749, "num_tokens": 939925.0, "reward": -1.566220998764038, "reward_std": 2.2558834552764893, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.1041666716337204, "rewards/belief_accuracy/std": 0.008333333767950535, "rewards/env_reward/mean": -0.8274806141853333, "rewards/env_reward/std": 1.4483462572097778, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 19.666667938232422, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1905, "frac_reward_zero_std": 0.0, "grad_norm": 2.3323750495910645, "kl": 1.2848467752337456, "learning_rate": 1.3333333333333333e-05, "loss": 0.0514, "num_tokens": 942416.0, "reward": -0.6763423681259155, "reward_std": 2.849104881286621, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.23839491605758667, "rewards/env_reward/std": 1.8410701751708984, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.191, "frac_reward_zero_std": 0.0, "grad_norm": 2.2046873569488525, "kl": 1.6044066846370697, "learning_rate": 1.3222222222222221e-05, "loss": 0.0642, "num_tokens": 944899.0, "reward": -0.8772720694541931, "reward_std": 2.7223305702209473, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.3431813418865204, "rewards/env_reward/std": 1.7810261249542236, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1915, "frac_reward_zero_std": 0.0, "grad_norm": 2.325809955596924, "kl": 1.2760074064135551, "learning_rate": 1.3111111111111113e-05, "loss": 0.051, "num_tokens": 947378.0, "reward": 0.27768200635910034, "reward_std": 0.5863049030303955, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3684546947479248, "rewards/env_reward/std": 0.39086994528770447, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.192, "frac_reward_zero_std": 1.0, "grad_norm": 3.8383851051330566, "kl": 2.990824520587921, "learning_rate": 1.3000000000000001e-05, "loss": 0.1196, "num_tokens": 949820.0, "reward": -0.10776805877685547, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1114879697561264, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1925, "frac_reward_zero_std": 1.0, "grad_norm": 0.13448657095432281, "kl": 1.0713577568531036, "learning_rate": 1.2888888888888889e-05, "loss": 0.0429, "num_tokens": 952323.0, "reward": 1.157989263534546, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9553261399269104, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.193, "frac_reward_zero_std": 0.0, "grad_norm": 1.9450551271438599, "kl": 0.9243371933698654, "learning_rate": 1.2777777777777777e-05, "loss": 0.037, "num_tokens": 954816.0, "reward": -0.5924662947654724, "reward_std": 0.07494938373565674, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.2116442173719406, "rewards/env_reward/std": 0.049966249614953995, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1935, "frac_reward_zero_std": 1.0, "grad_norm": 0.15212330222129822, "kl": 1.260749876499176, "learning_rate": 1.2666666666666668e-05, "loss": 0.0504, "num_tokens": 957301.0, "reward": 1.0907130241394043, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9104753136634827, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.194, "frac_reward_zero_std": 0.0, "grad_norm": 3.1250762939453125, "kl": 1.1979729011654854, "learning_rate": 1.2555555555555557e-05, "loss": 0.0479, "num_tokens": 959769.0, "reward": -1.0788286924362183, "reward_std": 2.5807807445526123, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5067191123962402, "rewards/env_reward/std": 1.6621873378753662, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1945, "frac_reward_zero_std": 1.0, "grad_norm": 5.245925426483154, "kl": 3.0835418105125427, "learning_rate": 1.2444444444444445e-05, "loss": 0.1233, "num_tokens": 962212.0, "reward": 0.7963976263999939, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7142651081085205, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.195, "frac_reward_zero_std": 0.0, "grad_norm": 1.65029776096344, "kl": 0.22941425442695618, "learning_rate": 1.2333333333333334e-05, "loss": 0.0092, "num_tokens": 964644.0, "reward": -0.39757806062698364, "reward_std": 0.055555522441864014, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.28703704476356506, "rewards/belief_accuracy/std": 0.018518507480621338, "rewards/env_reward/mean": 0.27568870782852173, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1955, "frac_reward_zero_std": 0.0, "grad_norm": 2.8236050605773926, "kl": 0.451984953135252, "learning_rate": 1.2222222222222222e-05, "loss": 0.0181, "num_tokens": 967076.0, "reward": 0.8589808344841003, "reward_std": 0.166666641831398, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1035659909248352, "rewards/belief_accuracy/std": 0.0555555522441864, "rewards/env_reward/mean": 0.7464525699615479, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.196, "frac_reward_zero_std": 0.0, "grad_norm": 3.8952887058258057, "kl": 1.2005406767129898, "learning_rate": 1.2111111111111112e-05, "loss": 0.048, "num_tokens": 969567.0, "reward": -0.6726483106613159, "reward_std": 2.8525443077087402, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08888889104127884, "rewards/belief_accuracy/std": 0.02222222276031971, "rewards/env_reward/mean": -0.2623211145401001, "rewards/env_reward/std": 1.8251193761825562, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1965, "frac_reward_zero_std": 0.0, "grad_norm": 2.469162702560425, "kl": 1.197481319308281, "learning_rate": 1.2e-05, "loss": 0.0479, "num_tokens": 972049.0, "reward": 0.8975731134414673, "reward_std": 0.051388900727033615, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.09351852536201477, "rewards/belief_accuracy/std": 0.012962963432073593, "rewards/env_reward/mean": 0.7645858526229858, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.197, "frac_reward_zero_std": 0.0, "grad_norm": 2.2335853576660156, "kl": 1.0094499439001083, "learning_rate": 1.188888888888889e-05, "loss": 0.0404, "num_tokens": 974519.0, "reward": 1.5189111232757568, "reward_std": 1.0298280715942383, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.07871220260858536, "rewards/belief_accuracy/std": 0.022310344502329826, "rewards/env_reward/mean": 0.8218495845794678, "rewards/env_reward/std": 0.7057132720947266, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1975, "frac_reward_zero_std": 0.0, "grad_norm": 2.5887300968170166, "kl": 1.6546301878988743, "learning_rate": 1.1777777777777778e-05, "loss": 0.0662, "num_tokens": 977004.0, "reward": 0.33133554458618164, "reward_std": 3.520890235900879, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4333902597427368, "rewards/env_reward/std": 2.288926839828491, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.198, "frac_reward_zero_std": 0.0, "grad_norm": 2.219426155090332, "kl": 1.7353740334510803, "learning_rate": 1.1666666666666668e-05, "loss": 0.0694, "num_tokens": 979440.0, "reward": 0.5393995642662048, "reward_std": 0.5616854429244995, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1666666716337204, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6595997214317322, "rewards/env_reward/std": 0.3744569718837738, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1985, "frac_reward_zero_std": 1.0, "grad_norm": 0.01091479230672121, "kl": 0.2686502933502197, "learning_rate": 1.1555555555555556e-05, "loss": 0.0107, "num_tokens": 981872.0, "reward": 0.04923933744430542, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.2777777910232544, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5550484657287598, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 10.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.199, "frac_reward_zero_std": 1.0, "grad_norm": 0.06738461554050446, "kl": 1.1423609554767609, "learning_rate": 1.1444444444444446e-05, "loss": 0.0457, "num_tokens": 984335.0, "reward": 0.34038877487182617, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4102592170238495, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 15.333333969116211, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1995, "frac_reward_zero_std": 1.0, "grad_norm": 0.11339511722326279, "kl": 1.6856607422232628, "learning_rate": 1.1333333333333334e-05, "loss": 0.0674, "num_tokens": 986813.0, "reward": 1.0687165260314941, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.8958110809326172, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 2.5888490676879883, "kl": 1.7492893785238266, "learning_rate": 1.1222222222222224e-05, "loss": 0.07, "num_tokens": 989300.0, "reward": -0.39356112480163574, "reward_std": 0.022566793486475945, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08703704178333282, "rewards/belief_accuracy/std": 0.021276157349348068, "rewards/env_reward/mean": -0.11329999566078186, "rewards/env_reward/std": 0.05000000074505806, "rewards/format_valid/mean": 0.75, "rewards/format_valid/std": 0.28867512941360474, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2005, "frac_reward_zero_std": 0.0, "grad_norm": 1.996265172958374, "kl": 1.6013427376747131, "learning_rate": 1.1111111111111112e-05, "loss": 0.0641, "num_tokens": 991777.0, "reward": -0.2611404061317444, "reward_std": 0.13472223281860352, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07962962985038757, "rewards/belief_accuracy/std": 0.04074074327945709, "rewards/env_reward/mean": -0.035667672753334045, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 21.666667938232422, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.201, "frac_reward_zero_std": 0.0, "grad_norm": 2.6548614501953125, "kl": 1.0493617877364159, "learning_rate": 1.1000000000000001e-05, "loss": 0.042, "num_tokens": 994274.0, "reward": -0.802269697189331, "reward_std": 2.765153408050537, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.3223464787006378, "rewards/env_reward/std": 1.785102367401123, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 23.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2015, "frac_reward_zero_std": 0.0, "grad_norm": 2.575714588165283, "kl": 0.9777809828519821, "learning_rate": 1.088888888888889e-05, "loss": 0.0391, "num_tokens": 996785.0, "reward": 0.05824078619480133, "reward_std": 0.025000005960464478, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.25132718682289124, "rewards/env_reward/std": 0.07500001043081284, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.202, "frac_reward_zero_std": 1.0, "grad_norm": 0.14137566089630127, "kl": 1.6623368486762047, "learning_rate": 1.0777777777777778e-05, "loss": 0.0665, "num_tokens": 999243.0, "reward": 0.7642103433609009, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1859063357114792, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.8479529023170471, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2025, "frac_reward_zero_std": 0.0, "grad_norm": 3.3691229820251465, "kl": 1.845133326947689, "learning_rate": 1.0666666666666667e-05, "loss": 0.0738, "num_tokens": 1001717.0, "reward": -1.5100492238998413, "reward_std": 2.2933003902435303, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.794199526309967, "rewards/env_reward/std": 1.4705337285995483, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.203, "frac_reward_zero_std": 1.0, "grad_norm": 0.1297660768032074, "kl": 1.9939737766981125, "learning_rate": 1.0555555555555555e-05, "loss": 0.0798, "num_tokens": 1004190.0, "reward": 0.8893073201179504, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7762049436569214, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2035, "frac_reward_zero_std": 1.0, "grad_norm": 0.20842799544334412, "kl": 1.046971783041954, "learning_rate": 1.0444444444444445e-05, "loss": 0.0419, "num_tokens": 1006688.0, "reward": 0.3194568157196045, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.39630457758903503, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.204, "frac_reward_zero_std": 1.0, "grad_norm": 0.02985587902367115, "kl": 0.5202329754829407, "learning_rate": 1.0333333333333333e-05, "loss": 0.0208, "num_tokens": 1008896.0, "reward": 1.842806100845337, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.1952041387557983, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2045, "frac_reward_zero_std": 0.0, "grad_norm": 2.0121374130249023, "kl": 1.4389217272400856, "learning_rate": 1.0222222222222223e-05, "loss": 0.0576, "num_tokens": 1011378.0, "reward": 0.06736606359481812, "reward_std": 0.008096039295196533, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10073399543762207, "rewards/belief_accuracy/std": 0.0014679878950119019, "rewards/env_reward/mean": 0.22554537653923035, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.205, "frac_reward_zero_std": 1.0, "grad_norm": 6.176392078399658, "kl": 3.0382063947618008, "learning_rate": 1.0111111111111111e-05, "loss": 0.1215, "num_tokens": 1013594.0, "reward": 1.1761062145233154, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7507375478744507, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 15.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2055, "frac_reward_zero_std": 1.0, "grad_norm": 0.10501120984554291, "kl": 1.108486369252205, "learning_rate": 1e-05, "loss": 0.0443, "num_tokens": 1016088.0, "reward": 1.6641442775726318, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.2927628755569458, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.206, "frac_reward_zero_std": 0.0, "grad_norm": 3.7421345710754395, "kl": 1.0211132764816284, "learning_rate": 9.888888888888889e-06, "loss": 0.0408, "num_tokens": 1018571.0, "reward": -0.005715020000934601, "reward_std": 0.26651811599731445, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.05766364932060242, "rewards/belief_accuracy/std": 0.084672711789608, "rewards/env_reward/mean": 0.0906839445233345, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2065, "frac_reward_zero_std": 0.0, "grad_norm": 2.120497226715088, "kl": 1.77897572144866, "learning_rate": 9.777777777777779e-06, "loss": 0.0712, "num_tokens": 1020803.0, "reward": -0.13511165976524353, "reward_std": 0.4962805509567261, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.12340778857469559, "rewards/env_reward/std": 0.33085373044013977, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.207, "frac_reward_zero_std": 0.0, "grad_norm": 3.10685396194458, "kl": 0.5586464405059814, "learning_rate": 9.666666666666667e-06, "loss": 0.0223, "num_tokens": 1023235.0, "reward": 0.6811659336090088, "reward_std": 0.055555541068315506, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1930294632911682, "rewards/belief_accuracy/std": 0.018518514931201935, "rewards/env_reward/mean": 0.8068363070487976, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09224820137023926, "kl": 1.9065433144569397, "learning_rate": 9.555555555555556e-06, "loss": 0.0763, "num_tokens": 1025714.0, "reward": 0.590803325176239, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5772022604942322, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 13.333333969116211, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.208, "frac_reward_zero_std": 1.0, "grad_norm": 0.12466186285018921, "kl": 1.163508489727974, "learning_rate": 9.444444444444445e-06, "loss": 0.0465, "num_tokens": 1028186.0, "reward": 1.1605198383331299, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9570131897926331, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 22.33333396911621, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2085, "frac_reward_zero_std": 1.0, "grad_norm": 0.09261281788349152, "kl": 1.1156965792179108, "learning_rate": 9.333333333333334e-06, "loss": 0.0446, "num_tokens": 1030685.0, "reward": -1.266386866569519, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6609245538711548, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.209, "frac_reward_zero_std": 0.0, "grad_norm": 3.7526919841766357, "kl": 1.766396388411522, "learning_rate": 9.222222222222222e-06, "loss": 0.0707, "num_tokens": 1033139.0, "reward": 1.3138718605041504, "reward_std": 0.20983462035655975, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.05388889089226723, "rewards/belief_accuracy/std": 0.06600598990917206, "rewards/env_reward/mean": 0.9586923718452454, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.75, "rewards/format_valid/std": 0.28867512941360474, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2095, "frac_reward_zero_std": 0.0, "grad_norm": 2.7062177658081055, "kl": 1.5106076151132584, "learning_rate": 9.111111111111112e-06, "loss": 0.0604, "num_tokens": 1035612.0, "reward": -0.5769614577293396, "reward_std": 0.08749997615814209, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.1721409559249878, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 10.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.21, "frac_reward_zero_std": 0.0, "grad_norm": 2.923778772354126, "kl": 1.7271861359477043, "learning_rate": 9e-06, "loss": 0.0691, "num_tokens": 1038074.0, "reward": 0.25312697887420654, "reward_std": 0.15385065972805023, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07644156366586685, "rewards/belief_accuracy/std": 0.047116879373788834, "rewards/env_reward/mean": 0.3008011281490326, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2105, "frac_reward_zero_std": 0.0, "grad_norm": 5.066768169403076, "kl": 2.71373450756073, "learning_rate": 8.88888888888889e-06, "loss": 0.1085, "num_tokens": 1040516.0, "reward": -0.3499433994293213, "reward_std": 3.066704273223877, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.020795553922653198, "rewards/env_reward/std": 1.9861361980438232, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.211, "frac_reward_zero_std": 1.0, "grad_norm": 0.10381243377923965, "kl": 1.082069344818592, "learning_rate": 8.777777777777778e-06, "loss": 0.0433, "num_tokens": 1042987.0, "reward": -0.5439499616622925, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.179299995303154, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 9.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.2115, "frac_reward_zero_std": 1.0, "grad_norm": 0.11354008316993713, "kl": 1.3446319997310638, "learning_rate": 8.666666666666668e-06, "loss": 0.0538, "num_tokens": 1045446.0, "reward": -0.7405999898910522, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.31040000915527344, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.212, "frac_reward_zero_std": 1.0, "grad_norm": 0.14288963377475739, "kl": 1.392886459827423, "learning_rate": 8.555555555555556e-06, "loss": 0.0557, "num_tokens": 1047922.0, "reward": 1.2015879154205322, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9843919277191162, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2125, "frac_reward_zero_std": 1.0, "grad_norm": 0.17405332624912262, "kl": 1.4787537753582, "learning_rate": 8.444444444444446e-06, "loss": 0.0592, "num_tokens": 1050410.0, "reward": 2.462561845779419, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.8250410556793213, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 18.666667938232422, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.213, "frac_reward_zero_std": 0.0, "grad_norm": 2.2080841064453125, "kl": 1.0187080278992653, "learning_rate": 8.333333333333334e-06, "loss": 0.0407, "num_tokens": 1052898.0, "reward": 0.6277508735656738, "reward_std": 0.5285455584526062, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.21416667103767395, "rewards/belief_accuracy/std": 0.008333340287208557, "rewards/env_reward/mean": 0.8135005235671997, "rewards/env_reward/std": 0.3554683029651642, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2135, "frac_reward_zero_std": 0.0, "grad_norm": 3.2846176624298096, "kl": 1.8003827184438705, "learning_rate": 8.222222222222223e-06, "loss": 0.072, "num_tokens": 1055354.0, "reward": -0.048611536622047424, "reward_std": 0.3375000059604645, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.15833333134651184, "rewards/belief_accuracy/std": 0.11666666716337204, "rewards/env_reward/mean": 0.26342564821243286, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.214, "frac_reward_zero_std": 0.0, "grad_norm": 1.872238278388977, "kl": 1.2767575085163116, "learning_rate": 8.111111111111112e-06, "loss": 0.0511, "num_tokens": 1057827.0, "reward": -0.1957390159368515, "reward_std": 0.0825425460934639, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.12407407909631729, "rewards/belief_accuracy/std": 0.03164445981383324, "rewards/env_reward/mean": 0.0926554724574089, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.75, "rewards/format_valid/std": 0.28867512941360474, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2145, "frac_reward_zero_std": 1.0, "grad_norm": 0.09198321402072906, "kl": 1.0719245225191116, "learning_rate": 8.000000000000001e-06, "loss": 0.0429, "num_tokens": 1060309.0, "reward": -1.1536386013031006, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5857589840888977, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.215, "frac_reward_zero_std": 0.0, "grad_norm": 3.5375423431396484, "kl": 1.4234507977962494, "learning_rate": 7.88888888888889e-06, "loss": 0.0569, "num_tokens": 1062804.0, "reward": -0.4885933995246887, "reward_std": 2.974423408508301, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10740740597248077, "rewards/belief_accuracy/std": 0.014814812690019608, "rewards/env_reward/mean": -0.10258075594902039, "rewards/env_reward/std": 1.9316128492355347, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2155, "frac_reward_zero_std": 0.0, "grad_norm": 3.5626556873321533, "kl": 1.6539364457130432, "learning_rate": 7.777777777777777e-06, "loss": 0.0662, "num_tokens": 1065015.0, "reward": 1.4107515811920166, "reward_std": 0.4393588900566101, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9071676731109619, "rewards/env_reward/std": 0.2929059565067291, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.216, "frac_reward_zero_std": 1.0, "grad_norm": 0.38476383686065674, "kl": 3.078851878643036, "learning_rate": 7.666666666666667e-06, "loss": 0.1232, "num_tokens": 1067451.0, "reward": 0.9827893376350403, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.8385262489318848, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2165, "frac_reward_zero_std": 1.0, "grad_norm": 0.14623303711414337, "kl": 1.1345348209142685, "learning_rate": 7.555555555555556e-06, "loss": 0.0454, "num_tokens": 1069941.0, "reward": 1.2882843017578125, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.042189598083496, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.217, "frac_reward_zero_std": 0.0, "grad_norm": 2.191251754760742, "kl": 1.4331532195210457, "learning_rate": 7.444444444444444e-06, "loss": 0.0573, "num_tokens": 1072149.0, "reward": 0.511351466178894, "reward_std": 0.19517327845096588, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.30756765604019165, "rewards/env_reward/std": 0.13011550903320312, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2175, "frac_reward_zero_std": 0.0, "grad_norm": 3.433861017227173, "kl": 1.154853031039238, "learning_rate": 7.333333333333334e-06, "loss": 0.0462, "num_tokens": 1074653.0, "reward": 1.1034250259399414, "reward_std": 0.7286821603775024, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10551381856203079, "rewards/belief_accuracy/std": 0.04305478185415268, "rewards/env_reward/mean": 0.9174776673316956, "rewards/env_reward/std": 0.4061686098575592, "rewards/format_valid/mean": 0.875, "rewards/format_valid/std": 0.25, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 19.33333396911621, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.218, "frac_reward_zero_std": 0.0, "grad_norm": 4.4158477783203125, "kl": 1.3408091366291046, "learning_rate": 7.222222222222222e-06, "loss": 0.0536, "num_tokens": 1077143.0, "reward": -1.1806925535202026, "reward_std": 2.512871503829956, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5746283531188965, "rewards/env_reward/std": 1.6169143915176392, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2185, "frac_reward_zero_std": 1.0, "grad_norm": 0.19002826511859894, "kl": 0.8864307105541229, "learning_rate": 7.111111111111112e-06, "loss": 0.0355, "num_tokens": 1079650.0, "reward": 0.4842817187309265, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5061878561973572, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 14.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.219, "frac_reward_zero_std": 1.0, "grad_norm": 0.25234878063201904, "kl": 1.5277681648731232, "learning_rate": 7.000000000000001e-06, "loss": 0.0611, "num_tokens": 1082125.0, "reward": 1.2354192733764648, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.0069462060928345, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2195, "frac_reward_zero_std": 1.0, "grad_norm": 2.109527587890625, "kl": 2.2620955407619476, "learning_rate": 6.888888888888889e-06, "loss": 0.0905, "num_tokens": 1084577.0, "reward": -0.516443133354187, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.16096210479736328, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 20.33333396911621, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.22, "frac_reward_zero_std": 0.0, "grad_norm": 3.669363021850586, "kl": 1.6046917140483856, "learning_rate": 6.777777777777779e-06, "loss": 0.0642, "num_tokens": 1087070.0, "reward": 0.5990688800811768, "reward_std": 0.2774999737739563, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.05583333224058151, "rewards/belief_accuracy/std": 0.08833333104848862, "rewards/env_reward/mean": 0.490212619304657, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2205, "frac_reward_zero_std": 1.0, "grad_norm": 0.10930506885051727, "kl": 1.5687530785799026, "learning_rate": 6.666666666666667e-06, "loss": 0.0628, "num_tokens": 1089552.0, "reward": 0.4188321828842163, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.46255481243133545, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.221, "frac_reward_zero_std": 1.0, "grad_norm": 0.11199460923671722, "kl": 1.3371323496103287, "learning_rate": 6.555555555555556e-06, "loss": 0.0535, "num_tokens": 1092045.0, "reward": 1.3014123439788818, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.0509415864944458, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2215, "frac_reward_zero_std": 0.0, "grad_norm": 2.5043163299560547, "kl": 2.0148645490407944, "learning_rate": 6.4444444444444445e-06, "loss": 0.0806, "num_tokens": 1094500.0, "reward": -0.14020271599292755, "reward_std": 0.3196193277835846, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.03164725750684738, "rewards/belief_accuracy/std": 0.0914727970957756, "rewards/env_reward/mean": -0.055173955857753754, "rewards/env_reward/std": 0.07499999552965164, "rewards/format_valid/mean": 0.75, "rewards/format_valid/std": 0.28867512941360474, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.222, "frac_reward_zero_std": 1.0, "grad_norm": 0.22092895209789276, "kl": 1.370383344590664, "learning_rate": 6.333333333333334e-06, "loss": 0.0548, "num_tokens": 1096976.0, "reward": 1.091750144958496, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.911166787147522, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 10.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2225, "frac_reward_zero_std": 1.0, "grad_norm": 0.20150171220302582, "kl": 1.3967806994915009, "learning_rate": 6.222222222222222e-06, "loss": 0.0559, "num_tokens": 1099440.0, "reward": 0.020131230354309082, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.19675415754318237, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.223, "frac_reward_zero_std": 1.0, "grad_norm": 0.15836045145988464, "kl": 2.047119751572609, "learning_rate": 6.111111111111111e-06, "loss": 0.0819, "num_tokens": 1101904.0, "reward": 0.2909669280052185, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3773113191127777, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 17.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2235, "frac_reward_zero_std": 1.0, "grad_norm": 0.17508073151111603, "kl": 1.1480788886547089, "learning_rate": 6e-06, "loss": 0.0459, "num_tokens": 1104403.0, "reward": 1.1857414245605469, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9738277196884155, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.224, "frac_reward_zero_std": 1.0, "grad_norm": 0.20271523296833038, "kl": 1.4859704226255417, "learning_rate": 5.888888888888889e-06, "loss": 0.0594, "num_tokens": 1106863.0, "reward": -0.151106059551239, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.08259596675634384, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2245, "frac_reward_zero_std": 1.0, "grad_norm": 0.14578959345817566, "kl": 1.7562050223350525, "learning_rate": 5.777777777777778e-06, "loss": 0.0702, "num_tokens": 1109349.0, "reward": -0.9775741100311279, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.46838271617889404, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 19.33333396911621, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.225, "frac_reward_zero_std": 1.0, "grad_norm": 0.16451287269592285, "kl": 1.11443629860878, "learning_rate": 5.666666666666667e-06, "loss": 0.0446, "num_tokens": 1111839.0, "reward": 0.36123454570770264, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.42415639758110046, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2255, "frac_reward_zero_std": 1.0, "grad_norm": 0.42332857847213745, "kl": 1.8851738721132278, "learning_rate": 5.555555555555556e-06, "loss": 0.0754, "num_tokens": 1114291.0, "reward": 0.47187477350234985, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4979165196418762, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.226, "frac_reward_zero_std": 0.0, "grad_norm": 3.978724718093872, "kl": 1.1868759840726852, "learning_rate": 5.444444444444445e-06, "loss": 0.0475, "num_tokens": 1116798.0, "reward": 1.1072639226913452, "reward_std": 0.21830104291439056, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9215093851089478, "rewards/env_reward/std": 0.14553406834602356, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2265, "frac_reward_zero_std": 1.0, "grad_norm": 0.23165586590766907, "kl": 1.9865920096635818, "learning_rate": 5.333333333333334e-06, "loss": 0.0795, "num_tokens": 1119262.0, "reward": -0.4410591125488281, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.11070608347654343, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.227, "frac_reward_zero_std": 0.0, "grad_norm": 3.738908290863037, "kl": 1.3586938455700874, "learning_rate": 5.2222222222222226e-06, "loss": 0.0543, "num_tokens": 1121756.0, "reward": 0.3270404040813446, "reward_std": 0.1799306422472, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.40136027336120605, "rewards/env_reward/std": 0.11995376646518707, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2275, "frac_reward_zero_std": 0.0, "grad_norm": 5.68782901763916, "kl": 2.2840545773506165, "learning_rate": 5.1111111111111115e-06, "loss": 0.0914, "num_tokens": 1124196.0, "reward": 0.7064803242683411, "reward_std": 0.6930884122848511, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.654320240020752, "rewards/env_reward/std": 0.4620589315891266, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 15.333333969116211, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.228, "frac_reward_zero_std": 1.0, "grad_norm": 0.14986330270767212, "kl": 1.6589103937149048, "learning_rate": 5e-06, "loss": 0.0664, "num_tokens": 1126674.0, "reward": 1.4591007232666016, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.156067132949829, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2285, "frac_reward_zero_std": 0.0, "grad_norm": 2.5412628650665283, "kl": 0.8467085063457489, "learning_rate": 4.888888888888889e-06, "loss": 0.0339, "num_tokens": 1129174.0, "reward": 0.6049916744232178, "reward_std": 0.3218177258968353, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.586661159992218, "rewards/env_reward/std": 0.21454516053199768, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 15.333333969116211, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.229, "frac_reward_zero_std": 1.0, "grad_norm": 0.17499062418937683, "kl": 1.236129179596901, "learning_rate": 4.777777777777778e-06, "loss": 0.0494, "num_tokens": 1131652.0, "reward": -0.4889889657497406, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.14265930652618408, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2295, "frac_reward_zero_std": 1.0, "grad_norm": 0.23451776802539825, "kl": 1.0196955502033234, "learning_rate": 4.666666666666667e-06, "loss": 0.0408, "num_tokens": 1134147.0, "reward": 0.9018483757972717, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7845656275749207, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.23, "frac_reward_zero_std": 1.0, "grad_norm": 0.290233314037323, "kl": 0.6138657331466675, "learning_rate": 4.555555555555556e-06, "loss": 0.0246, "num_tokens": 1136355.0, "reward": 1.1506599187850952, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7337732911109924, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2305, "frac_reward_zero_std": 0.0, "grad_norm": 4.5567545890808105, "kl": 1.081925056874752, "learning_rate": 4.444444444444445e-06, "loss": 0.0433, "num_tokens": 1138814.0, "reward": -0.3285835385322571, "reward_std": 3.080944299697876, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.006555706262588501, "rewards/env_reward/std": 1.9956294298171997, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.231, "frac_reward_zero_std": 1.0, "grad_norm": 0.20140407979488373, "kl": 1.9097826182842255, "learning_rate": 4.333333333333334e-06, "loss": 0.0764, "num_tokens": 1141280.0, "reward": 0.6215116381645203, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5976744294166565, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2315, "frac_reward_zero_std": 1.0, "grad_norm": 0.226417675614357, "kl": 1.086095541715622, "learning_rate": 4.222222222222223e-06, "loss": 0.0434, "num_tokens": 1143769.0, "reward": 1.0261733531951904, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.8674488663673401, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.232, "frac_reward_zero_std": 0.0, "grad_norm": 3.3003480434417725, "kl": 1.2525041699409485, "learning_rate": 4.111111111111112e-06, "loss": 0.0501, "num_tokens": 1146261.0, "reward": 0.5109961032867432, "reward_std": 0.11840394884347916, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10109934210777283, "rewards/belief_accuracy/std": 0.002198692411184311, "rewards/env_reward/mean": 0.5220293998718262, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.2325, "frac_reward_zero_std": 1.0, "grad_norm": 0.3115181028842926, "kl": 1.4747485369443893, "learning_rate": 4.000000000000001e-06, "loss": 0.059, "num_tokens": 1148737.0, "reward": -1.0867280960083008, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5411520600318909, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.233, "frac_reward_zero_std": 1.0, "grad_norm": 0.24885617196559906, "kl": 1.9252333343029022, "learning_rate": 3.888888888888889e-06, "loss": 0.077, "num_tokens": 1151161.0, "reward": 0.2553900480270386, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3535933792591095, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2335, "frac_reward_zero_std": 1.0, "grad_norm": 0.16646219789981842, "kl": 1.6244457215070724, "learning_rate": 3.777777777777778e-06, "loss": 0.065, "num_tokens": 1153644.0, "reward": 1.1453707218170166, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9469137787818909, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.234, "frac_reward_zero_std": 0.0, "grad_norm": 9.400774955749512, "kl": 1.2703840136528015, "learning_rate": 3.666666666666667e-06, "loss": 0.0508, "num_tokens": 1156121.0, "reward": -0.3506682515144348, "reward_std": 0.22495710849761963, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.050445497035980225, "rewards/env_reward/std": 0.14997142553329468, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2345, "frac_reward_zero_std": 0.0, "grad_norm": 2.9656176567077637, "kl": 1.1122565567493439, "learning_rate": 3.555555555555556e-06, "loss": 0.0445, "num_tokens": 1158592.0, "reward": 0.20864993333816528, "reward_std": 0.5419493913650513, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.32243332266807556, "rewards/env_reward/std": 0.36129963397979736, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.235, "frac_reward_zero_std": 0.0, "grad_norm": 2.9808571338653564, "kl": 1.142757073044777, "learning_rate": 3.4444444444444444e-06, "loss": 0.0457, "num_tokens": 1161094.0, "reward": 0.7835899591445923, "reward_std": 0.17083337903022766, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.06111111119389534, "rewards/belief_accuracy/std": 0.07777778059244156, "rewards/env_reward/mean": 0.6237821578979492, "rewards/env_reward/std": 0.04999998211860657, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2355, "frac_reward_zero_std": 1.0, "grad_norm": 0.19158995151519775, "kl": 1.6587612330913544, "learning_rate": 3.3333333333333333e-06, "loss": 0.0664, "num_tokens": 1163538.0, "reward": -0.2371583878993988, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.02522774413228035, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 10.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.236, "frac_reward_zero_std": 1.0, "grad_norm": 0.27240458130836487, "kl": 0.896557942032814, "learning_rate": 3.2222222222222222e-06, "loss": 0.0359, "num_tokens": 1166023.0, "reward": 1.0796802043914795, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.903120219707489, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 18.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2365, "frac_reward_zero_std": 0.0, "grad_norm": 2.825577735900879, "kl": 0.8296048268675804, "learning_rate": 3.111111111111111e-06, "loss": 0.0332, "num_tokens": 1168524.0, "reward": 1.6908910274505615, "reward_std": 0.2124999761581421, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.06666666269302368, "rewards/belief_accuracy/std": 0.06666667014360428, "rewards/env_reward/mean": 1.2397607564926147, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 15.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.237, "frac_reward_zero_std": 0.0, "grad_norm": 2.382891893386841, "kl": 1.113198146224022, "learning_rate": 3e-06, "loss": 0.0445, "num_tokens": 1171002.0, "reward": 1.2636022567749023, "reward_std": 0.38985100388526917, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.0257349014282227, "rewards/env_reward/std": 0.2599007189273834, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2375, "frac_reward_zero_std": 0.0, "grad_norm": 2.8809125423431396, "kl": 1.4048431143164635, "learning_rate": 2.888888888888889e-06, "loss": 0.0562, "num_tokens": 1173489.0, "reward": -0.06511135399341583, "reward_std": 0.047866158187389374, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07713063061237335, "rewards/belief_accuracy/std": 0.045738738030195236, "rewards/env_reward/mean": 0.09002035856246948, "rewards/env_reward/std": 0.06790003925561905, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.238, "frac_reward_zero_std": 0.0, "grad_norm": 3.7238736152648926, "kl": 1.2160827964544296, "learning_rate": 2.777777777777778e-06, "loss": 0.0486, "num_tokens": 1175948.0, "reward": -0.18826928734779358, "reward_std": 0.18310457468032837, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0715659037232399, "rewards/belief_accuracy/std": 0.05686819180846214, "rewards/env_reward/mean": -0.0032143734861165285, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 14.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2385, "frac_reward_zero_std": 0.0, "grad_norm": 4.449535846710205, "kl": 1.1292494237422943, "learning_rate": 2.666666666666667e-06, "loss": 0.0452, "num_tokens": 1178423.0, "reward": 0.36471137404441833, "reward_std": 0.32429030537605286, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.04803495109081268, "rewards/belief_accuracy/std": 0.10393010079860687, "rewards/env_reward/mean": 0.3183774948120117, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.239, "frac_reward_zero_std": 0.0, "grad_norm": 4.696188926696777, "kl": 1.435573399066925, "learning_rate": 2.5555555555555557e-06, "loss": 0.0574, "num_tokens": 1180889.0, "reward": 1.4060673713684082, "reward_std": 0.02499997615814209, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 1.1498782634735107, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2395, "frac_reward_zero_std": 1.0, "grad_norm": 1.0978991985321045, "kl": 1.2610341310501099, "learning_rate": 2.4444444444444447e-06, "loss": 0.0504, "num_tokens": 1183393.0, "reward": -0.08134973049163818, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.12910018861293793, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 3.3614814281463623, "kl": 1.5769162476062775, "learning_rate": 2.3333333333333336e-06, "loss": 0.0631, "num_tokens": 1185826.0, "reward": -0.26373326778411865, "reward_std": 1.0362647771835327, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1836622953414917, "rewards/belief_accuracy/std": 0.10475655645132065, "rewards/env_reward/mean": 0.15816909074783325, "rewards/env_reward/std": 0.49835386872291565, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2405, "frac_reward_zero_std": 0.0, "grad_norm": 3.7180566787719727, "kl": 1.8103009164333344, "learning_rate": 2.2222222222222225e-06, "loss": 0.0724, "num_tokens": 1188286.0, "reward": 0.1593647599220276, "reward_std": 0.6118594408035278, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.28957653045654297, "rewards/env_reward/std": 0.40790632367134094, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 20.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.241, "frac_reward_zero_std": 1.0, "grad_norm": 0.3375169038772583, "kl": 0.9421973675489426, "learning_rate": 2.1111111111111114e-06, "loss": 0.0377, "num_tokens": 1190779.0, "reward": 0.6196067929267883, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5964045524597168, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2415, "frac_reward_zero_std": 1.0, "grad_norm": 0.2469029277563095, "kl": 1.1642567813396454, "learning_rate": 2.0000000000000003e-06, "loss": 0.0466, "num_tokens": 1193274.0, "reward": -0.17581841349601746, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.06612106412649155, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.242, "frac_reward_zero_std": 0.0, "grad_norm": 4.307186603546143, "kl": 1.7343009114265442, "learning_rate": 1.888888888888889e-06, "loss": 0.0694, "num_tokens": 1195727.0, "reward": 1.351982831954956, "reward_std": 0.02499997615814209, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 1.1138218641281128, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 25.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2425, "frac_reward_zero_std": 1.0, "grad_norm": 0.24575987458229065, "kl": 1.0231992602348328, "learning_rate": 1.777777777777778e-06, "loss": 0.0409, "num_tokens": 1198234.0, "reward": -0.02136892080307007, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.16908739507198334, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 9.666666984558105, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.243, "frac_reward_zero_std": 0.0, "grad_norm": 5.336447715759277, "kl": 1.3090898543596268, "learning_rate": 1.6666666666666667e-06, "loss": 0.0524, "num_tokens": 1200695.0, "reward": -1.9136090278625488, "reward_std": 2.0242605209350586, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.063239336013794, "rewards/env_reward/std": 1.2911738157272339, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2435, "frac_reward_zero_std": 1.0, "grad_norm": 0.26777753233909607, "kl": 1.0288607757538557, "learning_rate": 1.5555555555555556e-06, "loss": 0.0412, "num_tokens": 1203157.0, "reward": 1.5726299285888672, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.2317533493041992, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.244, "frac_reward_zero_std": 1.0, "grad_norm": 0.23503462970256805, "kl": 0.559322252869606, "learning_rate": 1.4444444444444445e-06, "loss": 0.0224, "num_tokens": 1205685.0, "reward": 1.3888003826141357, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.109200358390808, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 13.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2445, "frac_reward_zero_std": 0.0, "grad_norm": 3.5624420642852783, "kl": 1.6142105609178543, "learning_rate": 1.3333333333333334e-06, "loss": 0.0646, "num_tokens": 1208157.0, "reward": 0.8933224678039551, "reward_std": 0.2008855640888214, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08735239505767822, "rewards/belief_accuracy/std": 0.025295214727520943, "rewards/env_reward/mean": 0.7494198083877563, "rewards/env_reward/std": 0.07499998807907104, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.245, "frac_reward_zero_std": 1.0, "grad_norm": 0.3598785996437073, "kl": 0.725849099457264, "learning_rate": 1.2222222222222223e-06, "loss": 0.029, "num_tokens": 1210663.0, "reward": 1.589991807937622, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.2433278560638428, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2455, "frac_reward_zero_std": 0.0, "grad_norm": 2.048388719558716, "kl": 0.6919376142323017, "learning_rate": 1.1111111111111112e-06, "loss": 0.0277, "num_tokens": 1213132.0, "reward": -0.04184141755104065, "reward_std": 0.977747917175293, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1554390788078308, "rewards/env_reward/std": 0.6518319249153137, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.246, "frac_reward_zero_std": 1.0, "grad_norm": 0.5983917713165283, "kl": 1.0341777577996254, "learning_rate": 1.0000000000000002e-06, "loss": 0.0414, "num_tokens": 1215627.0, "reward": 0.6422335505485535, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6114890575408936, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2465, "frac_reward_zero_std": 0.0, "grad_norm": 2.7554574012756348, "kl": 0.5243086963891983, "learning_rate": 8.88888888888889e-07, "loss": 0.021, "num_tokens": 1217835.0, "reward": 1.290155291557312, "reward_std": 0.05980373173952103, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.8267701864242554, "rewards/env_reward/std": 0.039869144558906555, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.247, "frac_reward_zero_std": 0.0, "grad_norm": 4.632870197296143, "kl": 1.0271499007940292, "learning_rate": 7.777777777777778e-07, "loss": 0.0411, "num_tokens": 1220308.0, "reward": 0.16512584686279297, "reward_std": 0.20950853824615479, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2934172749519348, "rewards/env_reward/std": 0.13967236876487732, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2475, "frac_reward_zero_std": 1.0, "grad_norm": 0.19535107910633087, "kl": 1.8744446635246277, "learning_rate": 6.666666666666667e-07, "loss": 0.075, "num_tokens": 1222771.0, "reward": 1.4706852436065674, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.163790225982666, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.248, "frac_reward_zero_std": 0.0, "grad_norm": 2.917879581451416, "kl": 1.0786767601966858, "learning_rate": 5.555555555555556e-07, "loss": 0.0431, "num_tokens": 1225233.0, "reward": -0.2330722212791443, "reward_std": 0.031944431364536285, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10740740597248077, "rewards/belief_accuracy/std": 0.014814812690019608, "rewards/env_reward/mean": 0.038600001484155655, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2485, "frac_reward_zero_std": 0.0, "grad_norm": 2.6436572074890137, "kl": 1.1599683165550232, "learning_rate": 4.444444444444445e-07, "loss": 0.0464, "num_tokens": 1227699.0, "reward": 1.316786289215088, "reward_std": 0.02361110784113407, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0981481522321701, "rewards/belief_accuracy/std": 0.0037037059664726257, "rewards/env_reward/mean": 1.0533205270767212, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.249, "frac_reward_zero_std": 1.0, "grad_norm": 0.323911190032959, "kl": 1.486757069826126, "learning_rate": 3.3333333333333335e-07, "loss": 0.0595, "num_tokens": 1230145.0, "reward": 1.410203218460083, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.1234688758850098, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2495, "frac_reward_zero_std": 0.0, "grad_norm": 3.010908603668213, "kl": 0.8313806504011154, "learning_rate": 2.2222222222222224e-07, "loss": 0.0333, "num_tokens": 1232632.0, "reward": -0.182204470038414, "reward_std": 0.07837501168251038, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.06186369061470032, "rewards/env_reward/std": 0.052250005304813385, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.2790248990058899, "kl": 1.8732239753007889, "learning_rate": 1.1111111111111112e-07, "loss": 0.0749, "num_tokens": 1235072.0, "reward": 1.3762123584747314, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.1008082628250122, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 1235072, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }