{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.125, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0005, "frac_reward_zero_std": 1.0, "grad_norm": 0.32243695855140686, "kl": 0.016345822252333164, "learning_rate": 0.0, "loss": 0.0007, "num_tokens": 2516.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 23.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.001, "frac_reward_zero_std": 1.0, "grad_norm": 0.1738930642604828, "kl": 0.0056577762588858604, "learning_rate": 1.0000000000000002e-06, "loss": 0.0002, "num_tokens": 5035.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0015, "frac_reward_zero_std": 0.0, "grad_norm": 53.453521728515625, "kl": 0.027107596397399902, "learning_rate": 2.0000000000000003e-06, "loss": 0.0011, "num_tokens": 7545.0, "reward": -3.724677085876465, "reward_std": 2.4506454467773438, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.212284803390503, "rewards/env_reward/std": 1.5754303932189941, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.002, "frac_reward_zero_std": 1.0, "grad_norm": 1.2984156608581543, "kl": 0.013630361296236515, "learning_rate": 3e-06, "loss": 0.0005, "num_tokens": 10063.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0025, "frac_reward_zero_std": 0.0, "grad_norm": 61.420711517333984, "kl": 0.0825746851041913, "learning_rate": 4.000000000000001e-06, "loss": 0.0033, "num_tokens": 12536.0, "reward": -3.895512342453003, "reward_std": 2.1089749336242676, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.3261749744415283, "rewards/env_reward/std": 1.3476500511169434, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.33333396911621, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.003, "frac_reward_zero_std": 0.0, "grad_norm": 11.032736778259277, "kl": 0.0035573970526456833, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 15059.0, "reward": -2.084261417388916, "reward_std": 3.3090696334838867, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.147840976715088, "rewards/env_reward/std": 2.1386890411376953, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0035, "frac_reward_zero_std": 1.0, "grad_norm": 0.20612499117851257, "kl": 0.007132542319595814, "learning_rate": 6e-06, "loss": 0.0003, "num_tokens": 17568.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 10.10438346862793, "kl": 0.05212839285377413, "learning_rate": 7.000000000000001e-06, "loss": 0.0021, "num_tokens": 20049.0, "reward": -3.4786999225616455, "reward_std": 2.9425997734069824, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.04830002784729, "rewards/env_reward/std": 1.90339994430542, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0045, "frac_reward_zero_std": 1.0, "grad_norm": 0.36320337653160095, "kl": 0.005910599138587713, "learning_rate": 8.000000000000001e-06, "loss": 0.0002, "num_tokens": 22569.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.005, "frac_reward_zero_std": 1.0, "grad_norm": 35.37952423095703, "kl": 0.214208863559179, "learning_rate": 9e-06, "loss": 0.0086, "num_tokens": 25052.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0055, "frac_reward_zero_std": 0.0, "grad_norm": 219.09710693359375, "kl": 0.09427966503426433, "learning_rate": 1e-05, "loss": 0.0038, "num_tokens": 27536.0, "reward": -2.487870216369629, "reward_std": 2.853968858718872, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.4169135093688965, "rewards/env_reward/std": 1.8355563879013062, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.006, "frac_reward_zero_std": 1.0, "grad_norm": 5.1641130447387695, "kl": 0.02741223480552435, "learning_rate": 1.1000000000000001e-05, "loss": 0.0011, "num_tokens": 30040.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0065, "frac_reward_zero_std": 0.0, "grad_norm": 18.722599029541016, "kl": 0.16925985834677704, "learning_rate": 1.2e-05, "loss": 0.0068, "num_tokens": 32510.0, "reward": -2.37943172454834, "reward_std": 2.9682364463806152, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3446213006973267, "rewards/env_reward/std": 1.9114667177200317, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.007, "frac_reward_zero_std": 1.0, "grad_norm": 0.1022278442978859, "kl": 0.006297597661614418, "learning_rate": 1.3000000000000001e-05, "loss": 0.0003, "num_tokens": 35028.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.0075, "frac_reward_zero_std": 1.0, "grad_norm": 0.032320525497198105, "kl": 0.002568609546869993, "learning_rate": 1.4000000000000001e-05, "loss": 0.0001, "num_tokens": 37556.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 22.33333396911621, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.008, "frac_reward_zero_std": 1.0, "grad_norm": 8.214491844177246, "kl": 0.041143732611089945, "learning_rate": 1.5e-05, "loss": 0.0016, "num_tokens": 40055.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0085, "frac_reward_zero_std": 0.0, "grad_norm": 39.8804817199707, "kl": 0.04003936113440432, "learning_rate": 1.6000000000000003e-05, "loss": 0.0016, "num_tokens": 42531.0, "reward": -2.5680184364318848, "reward_std": 2.750802993774414, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.11806440353393555, "rewards/belief_accuracy/std": 0.03612881526350975, "rewards/env_reward/mean": -1.4383834600448608, "rewards/env_reward/std": 1.8049958944320679, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.009, "frac_reward_zero_std": 1.0, "grad_norm": 1.495725393295288, "kl": 0.019086187705397606, "learning_rate": 1.7000000000000003e-05, "loss": 0.0008, "num_tokens": 45044.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0095, "frac_reward_zero_std": 1.0, "grad_norm": 26.531925201416016, "kl": 0.09912175685167313, "learning_rate": 1.8e-05, "loss": 0.004, "num_tokens": 47535.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 8.011573791503906, "kl": 0.0038544870913028717, "learning_rate": 1.9e-05, "loss": 0.0002, "num_tokens": 50063.0, "reward": -2.320432662963867, "reward_std": 3.037968873977661, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.305288553237915, "rewards/env_reward/std": 1.9579919576644897, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0105, "frac_reward_zero_std": 1.0, "grad_norm": 36.5081787109375, "kl": 0.2546631218865514, "learning_rate": 2e-05, "loss": 0.0102, "num_tokens": 52561.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.011, "frac_reward_zero_std": 1.0, "grad_norm": 1.0799552202224731, "kl": 0.009861491620540619, "learning_rate": 2.1e-05, "loss": 0.0004, "num_tokens": 55083.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 20.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0115, "frac_reward_zero_std": 1.0, "grad_norm": 23.75230598449707, "kl": 0.20189414219930768, "learning_rate": 2.2000000000000003e-05, "loss": 0.0081, "num_tokens": 57588.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.012, "frac_reward_zero_std": 0.0, "grad_norm": 102.0042953491211, "kl": 0.1681511290371418, "learning_rate": 2.3000000000000003e-05, "loss": 0.0067, "num_tokens": 60080.0, "reward": -1.766066074371338, "reward_std": 2.126420736312866, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.9648774862289429, "rewards/env_reward/std": 1.3593891859054565, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 13.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0125, "frac_reward_zero_std": 1.0, "grad_norm": 2.4423506259918213, "kl": 0.0637103128246963, "learning_rate": 2.4e-05, "loss": 0.0025, "num_tokens": 62571.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712759494781494, "kl": 0.005990173202008009, "learning_rate": 2.5e-05, "loss": 0.0002, "num_tokens": 65098.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0135, "frac_reward_zero_std": 1.0, "grad_norm": 0.1384836584329605, "kl": 0.018408390693366528, "learning_rate": 2.6000000000000002e-05, "loss": 0.0007, "num_tokens": 67611.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.014, "frac_reward_zero_std": 1.0, "grad_norm": 0.17841196060180664, "kl": 0.008233492728322744, "learning_rate": 2.7000000000000002e-05, "loss": 0.0003, "num_tokens": 70139.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0145, "frac_reward_zero_std": 1.0, "grad_norm": 14.524484634399414, "kl": 0.07956769224256277, "learning_rate": 2.8000000000000003e-05, "loss": 0.0032, "num_tokens": 72647.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.015, "frac_reward_zero_std": 1.0, "grad_norm": 0.6826711297035217, "kl": 0.05026988545432687, "learning_rate": 2.9e-05, "loss": 0.002, "num_tokens": 75144.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0155, "frac_reward_zero_std": 0.0, "grad_norm": 11050.3515625, "kl": 36.80695866746828, "learning_rate": 3e-05, "loss": 1.4723, "num_tokens": 77637.0, "reward": -2.443718194961548, "reward_std": 2.9238996505737305, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.0741666704416275, "rewards/belief_accuracy/std": 0.05166666582226753, "rewards/env_reward/mean": -1.443312168121338, "rewards/env_reward/std": 1.8071939945220947, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.016, "frac_reward_zero_std": 1.0, "grad_norm": 5.319654941558838, "kl": 0.1096202852204442, "learning_rate": 3.1e-05, "loss": 0.0044, "num_tokens": 80130.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0165, "frac_reward_zero_std": 0.0, "grad_norm": 28.577579498291016, "kl": 0.04399943072348833, "learning_rate": 3.2000000000000005e-05, "loss": 0.0018, "num_tokens": 82627.0, "reward": -3.7981131076812744, "reward_std": 2.3037734031677246, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.261242151260376, "rewards/env_reward/std": 1.477515697479248, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.017, "frac_reward_zero_std": 1.0, "grad_norm": 3.7505602836608887, "kl": 0.04482424072921276, "learning_rate": 3.3e-05, "loss": 0.0018, "num_tokens": 85109.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0175, "frac_reward_zero_std": 0.0, "grad_norm": 20.112499237060547, "kl": 0.0021229138001217507, "learning_rate": 3.4000000000000007e-05, "loss": 0.0001, "num_tokens": 87574.0, "reward": 0.1572304666042328, "reward_std": 0.04570581018924713, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.014735294505953789, "rewards/belief_accuracy/std": 0.09565715491771698, "rewards/env_reward/mean": 0.10095755755901337, "rewards/env_reward/std": 0.20054571330547333, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.018, "frac_reward_zero_std": 1.0, "grad_norm": 0.24909250438213348, "kl": 0.024185666348785162, "learning_rate": 3.5e-05, "loss": 0.001, "num_tokens": 90072.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 27.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0185, "frac_reward_zero_std": 0.0, "grad_norm": 11.509799003601074, "kl": 0.01711271144449711, "learning_rate": 3.6e-05, "loss": 0.0007, "num_tokens": 92595.0, "reward": -3.6846251487731934, "reward_std": 2.530749559402466, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.1855833530426025, "rewards/env_reward/std": 1.6288331747055054, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.019, "frac_reward_zero_std": 1.0, "grad_norm": 0.46738389134407043, "kl": 0.012128827278502285, "learning_rate": 3.7e-05, "loss": 0.0005, "num_tokens": 95085.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0195, "frac_reward_zero_std": 1.0, "grad_norm": 2.194053888320923, "kl": 0.039654724299907684, "learning_rate": 3.8e-05, "loss": 0.0016, "num_tokens": 97595.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.02, "frac_reward_zero_std": 1.0, "grad_norm": 0.2551957964897156, "kl": 0.02670608414337039, "learning_rate": 3.9000000000000006e-05, "loss": 0.0011, "num_tokens": 100093.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0205, "frac_reward_zero_std": 1.0, "grad_norm": 0.4345109760761261, "kl": 0.010095613077282906, "learning_rate": 4e-05, "loss": 0.0004, "num_tokens": 102612.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.021, "frac_reward_zero_std": 1.0, "grad_norm": 0.6849669218063354, "kl": 0.08905280428007245, "learning_rate": 4.1e-05, "loss": 0.0036, "num_tokens": 105099.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0215, "frac_reward_zero_std": 0.0, "grad_norm": 113.153564453125, "kl": 0.12807448720559478, "learning_rate": 4.2e-05, "loss": 0.0051, "num_tokens": 107595.0, "reward": -2.9136834144592285, "reward_std": 2.423197031021118, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.700788974761963, "rewards/env_reward/std": 1.5501903295516968, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.022, "frac_reward_zero_std": 0.0, "grad_norm": 3.1417529582977295, "kl": 0.05178070580586791, "learning_rate": 4.3e-05, "loss": 0.0021, "num_tokens": 110123.0, "reward": -3.766486167907715, "reward_std": 2.3670270442962646, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.2401576042175293, "rewards/env_reward/std": 1.5196847915649414, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0225, "frac_reward_zero_std": 0.0, "grad_norm": 31.787551879882812, "kl": 0.1364445798099041, "learning_rate": 4.4000000000000006e-05, "loss": 0.0055, "num_tokens": 112599.0, "reward": -2.7055277824401855, "reward_std": 2.6139395236968994, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.12398147583007812, "rewards/belief_accuracy/std": 0.04796295985579491, "rewards/env_reward/mean": -1.5182223320007324, "rewards/env_reward/std": 1.736833095550537, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 20.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.023, "frac_reward_zero_std": 0.0, "grad_norm": 809.3003540039062, "kl": 1.457309697754681, "learning_rate": 4.5e-05, "loss": 0.0583, "num_tokens": 115104.0, "reward": -2.267341136932373, "reward_std": 3.0976674556732178, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.2698941230773926, "rewards/env_reward/std": 1.9977540969848633, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0235, "frac_reward_zero_std": 1.0, "grad_norm": 17.53619384765625, "kl": 1.0537898712791502, "learning_rate": 4.600000000000001e-05, "loss": 0.0422, "num_tokens": 117580.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 187.8842010498047, "kl": 0.3121867855079472, "learning_rate": 4.7e-05, "loss": 0.0125, "num_tokens": 120073.0, "reward": -3.6274335384368896, "reward_std": 2.645132541656494, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.1474556922912598, "rewards/env_reward/std": 1.705088496208191, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0245, "frac_reward_zero_std": 0.0, "grad_norm": 18.377052307128906, "kl": 0.10642453748732805, "learning_rate": 4.8e-05, "loss": 0.0043, "num_tokens": 122588.0, "reward": -3.7050957679748535, "reward_std": 2.4898080825805664, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.199230432510376, "rewards/env_reward/std": 1.6015390157699585, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 27.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.025, "frac_reward_zero_std": 0.0, "grad_norm": 8.019148826599121, "kl": 0.056871576234698296, "learning_rate": 4.9e-05, "loss": 0.0023, "num_tokens": 125111.0, "reward": -3.6691508293151855, "reward_std": 2.5616979598999023, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.175267219543457, "rewards/env_reward/std": 1.6494653224945068, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 26.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0255, "frac_reward_zero_std": 0.0, "grad_norm": 13.499008178710938, "kl": 0.0715335039421916, "learning_rate": 5e-05, "loss": 0.0029, "num_tokens": 127628.0, "reward": -2.420839786529541, "reward_std": 2.920422315597534, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.372226595878601, "rewards/env_reward/std": 1.8795907497406006, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.026, "frac_reward_zero_std": 0.0, "grad_norm": 5.560655117034912, "kl": 0.017654206603765488, "learning_rate": 4.9888888888888894e-05, "loss": 0.0007, "num_tokens": 130156.0, "reward": -4.051011085510254, "reward_std": 1.7979769706726074, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.4298410415649414, "rewards/env_reward/std": 1.1403180360794067, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0265, "frac_reward_zero_std": 0.0, "grad_norm": 18.195871353149414, "kl": 0.24096931191161275, "learning_rate": 4.977777777777778e-05, "loss": 0.0096, "num_tokens": 132651.0, "reward": -2.6885905265808105, "reward_std": 2.6208035945892334, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.5507268905639648, "rewards/env_reward/std": 1.6801002025604248, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 17.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.027, "frac_reward_zero_std": 1.0, "grad_norm": 123.76203155517578, "kl": 5.633732934948057, "learning_rate": 4.966666666666667e-05, "loss": 0.2253, "num_tokens": 135150.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 27.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0275, "frac_reward_zero_std": 0.0, "grad_norm": 11.095720291137695, "kl": 0.04261765070259571, "learning_rate": 4.955555555555556e-05, "loss": 0.0017, "num_tokens": 137669.0, "reward": -3.5939033031463623, "reward_std": 2.712193012237549, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.1251022815704346, "rewards/env_reward/std": 1.7497954368591309, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.028, "frac_reward_zero_std": 0.0, "grad_norm": 7.206400394439697, "kl": 0.0018149956013076007, "learning_rate": 4.9444444444444446e-05, "loss": 0.0001, "num_tokens": 140095.0, "reward": -0.6511554718017578, "reward_std": 0.4664153754711151, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.2507702708244324, "rewards/env_reward/std": 0.3109435439109802, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0285, "frac_reward_zero_std": 0.0, "grad_norm": 9.72509479522705, "kl": 0.03556834487244487, "learning_rate": 4.933333333333334e-05, "loss": 0.0014, "num_tokens": 142580.0, "reward": -1.1361982822418213, "reward_std": 2.5431196689605713, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5449655055999756, "rewards/env_reward/std": 1.6370937824249268, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.029, "frac_reward_zero_std": 0.0, "grad_norm": 69.71238708496094, "kl": 0.12905889004468918, "learning_rate": 4.922222222222222e-05, "loss": 0.0052, "num_tokens": 145053.0, "reward": -3.697530746459961, "reward_std": 2.5049378871917725, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.1941874027252197, "rewards/env_reward/std": 1.61162531375885, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0295, "frac_reward_zero_std": 0.0, "grad_norm": 4.306075572967529, "kl": 0.05293075350346044, "learning_rate": 4.9111111111111114e-05, "loss": 0.0021, "num_tokens": 147515.0, "reward": -0.8994538187980652, "reward_std": 0.14630256593227386, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.165370374917984, "rewards/belief_accuracy/std": 0.04776628687977791, "rewards/env_reward/mean": -0.30222848057746887, "rewards/env_reward/std": 0.13543139398097992, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 5.006049633026123, "kl": 0.024995889314595843, "learning_rate": 4.9e-05, "loss": 0.001, "num_tokens": 149950.0, "reward": -0.2717297375202179, "reward_std": 0.27656516432762146, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.03703703731298447, "rewards/belief_accuracy/std": 0.04781460762023926, "rewards/env_reward/mean": -0.14041242003440857, "rewards/env_reward/std": 0.21106119453907013, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0305, "frac_reward_zero_std": 1.0, "grad_norm": 1.1231383085250854, "kl": 0.25201990082859993, "learning_rate": 4.888888888888889e-05, "loss": 0.0101, "num_tokens": 152433.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.031, "frac_reward_zero_std": 0.0, "grad_norm": 14.847359657287598, "kl": 0.3126356555148959, "learning_rate": 4.8777777777777775e-05, "loss": 0.0125, "num_tokens": 154901.0, "reward": -1.7808257341384888, "reward_std": 3.659447193145752, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.9455505609512329, "rewards/env_reward/std": 2.372274160385132, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0315, "frac_reward_zero_std": 0.0, "grad_norm": 11.686541557312012, "kl": 0.153579062782228, "learning_rate": 4.866666666666667e-05, "loss": 0.0061, "num_tokens": 157399.0, "reward": -2.4691736698150635, "reward_std": 2.869215488433838, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.404449224472046, "rewards/env_reward/std": 1.845564842224121, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 20.33333396911621, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 92.96546936035156, "kl": 0.2878706678748131, "learning_rate": 4.855555555555556e-05, "loss": 0.0115, "num_tokens": 159892.0, "reward": -3.7632439136505127, "reward_std": 2.373511791229248, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.2379961013793945, "rewards/env_reward/std": 1.5240079164505005, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0325, "frac_reward_zero_std": 0.0, "grad_norm": 36.191368103027344, "kl": 0.35182441864162683, "learning_rate": 4.844444444444445e-05, "loss": 0.0141, "num_tokens": 162402.0, "reward": -2.2611498832702637, "reward_std": 3.114436149597168, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10736748576164246, "rewards/belief_accuracy/std": 0.014734972268342972, "rewards/env_reward/mean": -1.255198359489441, "rewards/env_reward/std": 2.0199925899505615, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 11.666666984558105, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.033, "frac_reward_zero_std": 0.0, "grad_norm": 228884.828125, "kl": 468.09647609852254, "learning_rate": 4.8333333333333334e-05, "loss": 18.7239, "num_tokens": 164869.0, "reward": -3.877704381942749, "reward_std": 2.1445908546447754, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.314302921295166, "rewards/env_reward/std": 1.3713939189910889, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0335, "frac_reward_zero_std": 0.0, "grad_norm": 4.94432258605957, "kl": 0.21442949026823044, "learning_rate": 4.8222222222222225e-05, "loss": 0.0086, "num_tokens": 167387.0, "reward": -3.75144624710083, "reward_std": 2.397106885910034, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.230130910873413, "rewards/env_reward/std": 1.5397380590438843, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 8.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.034, "frac_reward_zero_std": 0.0, "grad_norm": 10.818193435668945, "kl": 0.9112066635861993, "learning_rate": 4.811111111111111e-05, "loss": 0.0364, "num_tokens": 169620.0, "reward": 0.4229079484939575, "reward_std": 0.2314292937517166, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.24860529601573944, "rewards/env_reward/std": 0.154286190867424, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 13.333333969116211, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0345, "frac_reward_zero_std": 0.0, "grad_norm": 25.480619430541992, "kl": 2.5357193499803543, "learning_rate": 4.8e-05, "loss": 0.1014, "num_tokens": 172092.0, "reward": -2.201890468597412, "reward_std": 3.173243284225464, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.2262604236602783, "rewards/env_reward/std": 2.04813814163208, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 22.666667938232422, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.035, "frac_reward_zero_std": 0.0, "grad_norm": 8.468518257141113, "kl": 0.5803861692547798, "learning_rate": 4.7888888888888886e-05, "loss": 0.0232, "num_tokens": 174592.0, "reward": -2.594465732574463, "reward_std": 2.7201738357543945, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.0878773033618927, "rewards/belief_accuracy/std": 0.024245386943221092, "rewards/env_reward/mean": -1.5163891315460205, "rewards/env_reward/std": 1.7145698070526123, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 22.666667938232422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0355, "frac_reward_zero_std": 0.0, "grad_norm": 46.88726806640625, "kl": 0.7163544222712517, "learning_rate": 4.7777777777777784e-05, "loss": 0.0287, "num_tokens": 177092.0, "reward": -2.0240089893341064, "reward_std": 3.3790602684020996, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.1076725721359253, "rewards/env_reward/std": 2.1853580474853516, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.036, "frac_reward_zero_std": 0.0, "grad_norm": 4.72477388381958, "kl": 0.7021452663466334, "learning_rate": 4.766666666666667e-05, "loss": 0.0281, "num_tokens": 179581.0, "reward": -1.4259536266326904, "reward_std": 2.3681116104125977, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7381357550621033, "rewards/env_reward/std": 1.5208872556686401, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0365, "frac_reward_zero_std": 0.0, "grad_norm": 26.196685791015625, "kl": 0.6366847828030586, "learning_rate": 4.755555555555556e-05, "loss": 0.0255, "num_tokens": 182076.0, "reward": -1.664202332496643, "reward_std": 2.311755418777466, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08500000089406967, "rewards/belief_accuracy/std": 0.030000001192092896, "rewards/env_reward/mean": -0.9311348795890808, "rewards/env_reward/std": 1.4874457120895386, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.037, "frac_reward_zero_std": 0.0, "grad_norm": 38.14860916137695, "kl": 1.0752212293446064, "learning_rate": 4.7444444444444445e-05, "loss": 0.043, "num_tokens": 184544.0, "reward": -1.49713134765625, "reward_std": 2.301912307739258, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7855876684188843, "rewards/env_reward/std": 1.476274847984314, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.0375, "frac_reward_zero_std": 0.0, "grad_norm": 3.1271560192108154, "kl": 0.22124752588570118, "learning_rate": 4.7333333333333336e-05, "loss": 0.0088, "num_tokens": 187072.0, "reward": -2.7508177757263184, "reward_std": 2.5399067401885986, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.5922119617462158, "rewards/env_reward/std": 1.6259276866912842, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.038, "frac_reward_zero_std": 0.0, "grad_norm": 5.755521774291992, "kl": 0.16726691462099552, "learning_rate": 4.722222222222222e-05, "loss": 0.0067, "num_tokens": 189600.0, "reward": -3.8879446983337402, "reward_std": 2.124109983444214, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.3211300373077393, "rewards/env_reward/std": 1.357740044593811, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0385, "frac_reward_zero_std": 1.0, "grad_norm": 0.24942930042743683, "kl": 0.789710771292448, "learning_rate": 4.711111111111111e-05, "loss": 0.0316, "num_tokens": 192108.0, "reward": -4.949999809265137, "reward_std": 0.0, "rewards/action_legal/mean": -1.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -3.0, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": -2.0, "rewards/format_valid/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 11.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.039, "frac_reward_zero_std": 0.0, "grad_norm": 4.9244184494018555, "kl": 1.1887651532888412, "learning_rate": 4.7e-05, "loss": 0.0476, "num_tokens": 194575.0, "reward": -1.6062259674072266, "reward_std": 2.2595274448394775, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08573612570762634, "rewards/belief_accuracy/std": 0.028527740389108658, "rewards/env_reward/mean": -0.8910117149353027, "rewards/env_reward/std": 1.4291024208068848, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 21.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0395, "frac_reward_zero_std": 0.0, "grad_norm": 4.589632034301758, "kl": 0.914489395916462, "learning_rate": 4.6888888888888895e-05, "loss": 0.0366, "num_tokens": 197081.0, "reward": -1.3840163946151733, "reward_std": 2.4131362438201904, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.681010901927948, "rewards/env_reward/std": 1.562245488166809, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 19.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 23.94391441345215, "kl": 0.7511968985199928, "learning_rate": 4.677777777777778e-05, "loss": 0.03, "num_tokens": 199584.0, "reward": -1.7008922100067139, "reward_std": 2.489635705947876, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.9214280843734741, "rewards/env_reward/std": 1.6092621088027954, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0405, "frac_reward_zero_std": 0.0, "grad_norm": 14.980483055114746, "kl": 1.2917132005095482, "learning_rate": 4.666666666666667e-05, "loss": 0.0517, "num_tokens": 202050.0, "reward": -1.1099207401275635, "reward_std": 2.573901891708374, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5274472236633301, "rewards/env_reward/std": 1.6579262018203735, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.041, "frac_reward_zero_std": 0.0, "grad_norm": 18.656517028808594, "kl": 1.1007941216230392, "learning_rate": 4.6555555555555556e-05, "loss": 0.044, "num_tokens": 204526.0, "reward": -0.9679015278816223, "reward_std": 2.6547322273254395, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.43276768922805786, "rewards/env_reward/std": 1.7114882469177246, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0415, "frac_reward_zero_std": 0.0, "grad_norm": 6.2242937088012695, "kl": 1.004544973373413, "learning_rate": 4.644444444444445e-05, "loss": 0.0402, "num_tokens": 207005.0, "reward": -2.1635308265686035, "reward_std": 3.2175371646881104, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.200687289237976, "rewards/env_reward/std": 2.077667474746704, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.042, "frac_reward_zero_std": 0.0, "grad_norm": 14.178193092346191, "kl": 0.9078696174547076, "learning_rate": 4.633333333333333e-05, "loss": 0.0363, "num_tokens": 209475.0, "reward": 0.19402220845222473, "reward_std": 0.2724432051181793, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.00833333283662796, "rewards/belief_accuracy/std": 0.10671874135732651, "rewards/env_reward/mean": 0.08351479470729828, "rewards/env_reward/std": 0.14911670982837677, "rewards/format_valid/mean": 0.875, "rewards/format_valid/std": 0.25, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0425, "frac_reward_zero_std": 0.0, "grad_norm": 7.778254508972168, "kl": 1.0725902691483498, "learning_rate": 4.6222222222222224e-05, "loss": 0.0429, "num_tokens": 211956.0, "reward": -0.8444531559944153, "reward_std": 2.7435097694396973, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0925000011920929, "rewards/belief_accuracy/std": 0.015000000596046448, "rewards/env_reward/mean": -0.3696354031562805, "rewards/env_reward/std": 1.7566334009170532, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.043, "frac_reward_zero_std": 0.0, "grad_norm": 4.498073101043701, "kl": 0.7945144101977348, "learning_rate": 4.6111111111111115e-05, "loss": 0.0318, "num_tokens": 214458.0, "reward": -0.4258846640586853, "reward_std": 0.525246798992157, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.07500000298023224, "rewards/belief_accuracy/std": 0.05000000074505806, "rewards/env_reward/mean": -0.15475642681121826, "rewards/env_reward/std": 0.276262104511261, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 15.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0435, "frac_reward_zero_std": 0.0, "grad_norm": 5.651465892791748, "kl": 1.144854974001646, "learning_rate": 4.600000000000001e-05, "loss": 0.0458, "num_tokens": 216952.0, "reward": -2.7960398197174072, "reward_std": 2.4879508018493652, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10818149149417877, "rewards/belief_accuracy/std": 0.016362976282835007, "rewards/env_reward/mean": -1.610163688659668, "rewards/env_reward/std": 1.6060127019882202, "rewards/format_valid/mean": -0.625, "rewards/format_valid/std": 1.6007810831069946, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.044, "frac_reward_zero_std": 0.0, "grad_norm": 4.523915767669678, "kl": 1.0343455001711845, "learning_rate": 4.588888888888889e-05, "loss": 0.0414, "num_tokens": 219437.0, "reward": -2.3472089767456055, "reward_std": 3.005443811416626, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3231394290924072, "rewards/env_reward/std": 1.9362717866897583, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0445, "frac_reward_zero_std": 0.0, "grad_norm": 3.352187395095825, "kl": 1.272004920989275, "learning_rate": 4.577777777777778e-05, "loss": 0.0509, "num_tokens": 221904.0, "reward": -1.3286750316619873, "reward_std": 2.4145350456237793, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.6441167593002319, "rewards/env_reward/std": 1.5708537101745605, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.045, "frac_reward_zero_std": 0.0, "grad_norm": 3.992652654647827, "kl": 1.1837435215711594, "learning_rate": 4.566666666666667e-05, "loss": 0.0473, "num_tokens": 224393.0, "reward": -0.807397723197937, "reward_std": 2.8079702854156494, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.3299318850040436, "rewards/env_reward/std": 1.8133907318115234, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0455, "frac_reward_zero_std": 0.0, "grad_norm": 5.360654354095459, "kl": 1.3510248363018036, "learning_rate": 4.555555555555556e-05, "loss": 0.054, "num_tokens": 226870.0, "reward": 0.35237032175064087, "reward_std": 1.0852247476577759, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.418246865272522, "rewards/env_reward/std": 0.723483145236969, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 19.666667938232422, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.046, "frac_reward_zero_std": 0.0, "grad_norm": 3.634124517440796, "kl": 0.8390218988060951, "learning_rate": 4.5444444444444444e-05, "loss": 0.0336, "num_tokens": 229361.0, "reward": -2.2453417778015137, "reward_std": 3.167144298553467, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.25522780418396, "rewards/env_reward/std": 2.0450401306152344, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0465, "frac_reward_zero_std": 1.0, "grad_norm": 0.345480352640152, "kl": 1.7472785264253616, "learning_rate": 4.5333333333333335e-05, "loss": 0.0699, "num_tokens": 231818.0, "reward": 0.20606237649917603, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3207082748413086, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.047, "frac_reward_zero_std": 0.0, "grad_norm": 4.007046222686768, "kl": 1.7106561437249184, "learning_rate": 4.522222222222223e-05, "loss": 0.0684, "num_tokens": 234305.0, "reward": -1.3136588335037231, "reward_std": 2.4297609329223633, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6632725596427917, "rewards/env_reward/std": 1.5616451501846313, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0475, "frac_reward_zero_std": 0.0, "grad_norm": 6.215799808502197, "kl": 2.4182121604681015, "learning_rate": 4.511111111111112e-05, "loss": 0.0967, "num_tokens": 236746.0, "reward": -1.4073553085327148, "reward_std": 2.4502437114715576, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7257369160652161, "rewards/env_reward/std": 1.5773454904556274, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 2.8741865158081055, "kl": 1.375985711812973, "learning_rate": 4.5e-05, "loss": 0.055, "num_tokens": 239220.0, "reward": -1.8366073369979858, "reward_std": 2.075605630874634, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.0119048357009888, "rewards/env_reward/std": 1.325404167175293, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0485, "frac_reward_zero_std": 0.0, "grad_norm": 2.7931768894195557, "kl": 1.1252032294869423, "learning_rate": 4.4888888888888894e-05, "loss": 0.045, "num_tokens": 241690.0, "reward": -0.21447324752807617, "reward_std": 0.08092716336250305, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.040351178497076035, "rewards/env_reward/std": 0.053951445966959, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.049, "frac_reward_zero_std": 0.0, "grad_norm": 3.6995668411254883, "kl": 0.2477953266352415, "learning_rate": 4.477777777777778e-05, "loss": 0.0099, "num_tokens": 243898.0, "reward": 0.10004599392414093, "reward_std": 0.12990380823612213, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.03336399421095848, "rewards/env_reward/std": 0.08660253882408142, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0495, "frac_reward_zero_std": 0.0, "grad_norm": 3.9689114093780518, "kl": 1.3716598898172379, "learning_rate": 4.466666666666667e-05, "loss": 0.0549, "num_tokens": 246394.0, "reward": 0.05885888263583183, "reward_std": 0.17086723446846008, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11083333194255829, "rewards/belief_accuracy/std": 0.021666666492819786, "rewards/env_reward/mean": 0.2400725781917572, "rewards/env_reward/std": 0.08001596480607986, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 6.792644500732422, "kl": 1.8348833322525024, "learning_rate": 4.4555555555555555e-05, "loss": 0.0734, "num_tokens": 248839.0, "reward": 0.2527257204055786, "reward_std": 0.15090236067771912, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.35181713104248047, "rewards/env_reward/std": 0.10060158371925354, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0505, "frac_reward_zero_std": 0.0, "grad_norm": 2.8620173931121826, "kl": 1.4036446511745453, "learning_rate": 4.4444444444444447e-05, "loss": 0.0561, "num_tokens": 251307.0, "reward": -1.31059730052948, "reward_std": 2.4275147914886475, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.1158333271741867, "rewards/belief_accuracy/std": 0.03166666254401207, "rewards/env_reward/mean": -0.6337315440177917, "rewards/env_reward/std": 1.577512264251709, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.051, "frac_reward_zero_std": 0.0, "grad_norm": 2.1169137954711914, "kl": 1.8475644141435623, "learning_rate": 4.433333333333334e-05, "loss": 0.0739, "num_tokens": 253767.0, "reward": -1.3670077323913574, "reward_std": 2.399441719055176, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6988385319709778, "rewards/env_reward/std": 1.5415664911270142, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0515, "frac_reward_zero_std": 0.0, "grad_norm": 3.684938430786133, "kl": 2.059985037893057, "learning_rate": 4.422222222222222e-05, "loss": 0.0824, "num_tokens": 256198.0, "reward": -1.4205896854400635, "reward_std": 2.3623111248016357, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7345598340034485, "rewards/env_reward/std": 1.516780972480774, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.052, "frac_reward_zero_std": 0.0, "grad_norm": 3.5143520832061768, "kl": 1.3098777011036873, "learning_rate": 4.4111111111111114e-05, "loss": 0.0524, "num_tokens": 258632.0, "reward": -0.0835796445608139, "reward_std": 0.2586938738822937, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0363982617855072, "rewards/belief_accuracy/std": 0.11159241199493408, "rewards/env_reward/mean": -0.016256578266620636, "rewards/env_reward/std": 0.26623615622520447, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0525, "frac_reward_zero_std": 0.0, "grad_norm": 6.2300004959106445, "kl": 1.785375103354454, "learning_rate": 4.4000000000000006e-05, "loss": 0.0714, "num_tokens": 261090.0, "reward": -0.08198876678943634, "reward_std": 0.8859658241271973, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.12867416441440582, "rewards/env_reward/std": 0.5906438827514648, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.053, "frac_reward_zero_std": 0.0, "grad_norm": 2.8233327865600586, "kl": 1.1763433814048767, "learning_rate": 4.388888888888889e-05, "loss": 0.0471, "num_tokens": 263555.0, "reward": -2.6301207542419434, "reward_std": 2.697817087173462, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.511747121810913, "rewards/env_reward/std": 1.7316814661026, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0535, "frac_reward_zero_std": 0.0, "grad_norm": 5.215397357940674, "kl": 1.5616333931684494, "learning_rate": 4.377777777777778e-05, "loss": 0.0625, "num_tokens": 266021.0, "reward": -0.27485907077789307, "reward_std": 0.8942175507545471, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 9.395182132720947e-05, "rewards/env_reward/std": 0.5961450934410095, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.054, "frac_reward_zero_std": 0.0, "grad_norm": 4.469447612762451, "kl": 2.5851728469133377, "learning_rate": 4.3666666666666666e-05, "loss": 0.1034, "num_tokens": 268461.0, "reward": -1.1452138423919678, "reward_std": 2.582923650741577, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5509759187698364, "rewards/env_reward/std": 1.6647002696990967, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0545, "frac_reward_zero_std": 0.0, "grad_norm": 7.712915420532227, "kl": 2.6475657522678375, "learning_rate": 4.355555555555556e-05, "loss": 0.1059, "num_tokens": 270898.0, "reward": 0.12694786489009857, "reward_std": 0.002898484468460083, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.26796525716781616, "rewards/env_reward/std": 0.0019323229789733887, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.055, "frac_reward_zero_std": 0.0, "grad_norm": 7.97365140914917, "kl": 2.896424412727356, "learning_rate": 4.344444444444445e-05, "loss": 0.1159, "num_tokens": 273372.0, "reward": -0.7330765128135681, "reward_std": 0.20495304465293884, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.06111111119389534, "rewards/belief_accuracy/std": 0.07777778059244156, "rewards/env_reward/mean": -0.38732877373695374, "rewards/env_reward/std": 0.12224072217941284, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0555, "frac_reward_zero_std": 0.0, "grad_norm": 4.209381580352783, "kl": 1.7662896811962128, "learning_rate": 4.3333333333333334e-05, "loss": 0.0707, "num_tokens": 275814.0, "reward": 0.027455374598503113, "reward_std": 0.4914630353450775, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2016369253396988, "rewards/env_reward/std": 0.3276420533657074, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 3.021775960922241, "kl": 2.54108564555645, "learning_rate": 4.3222222222222226e-05, "loss": 0.1016, "num_tokens": 278282.0, "reward": -0.2888333201408386, "reward_std": 0.15247361361980438, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.009222209453582764, "rewards/env_reward/std": 0.10164907574653625, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0565, "frac_reward_zero_std": 0.0, "grad_norm": 4.275335311889648, "kl": 1.6538867950439453, "learning_rate": 4.311111111111111e-05, "loss": 0.0662, "num_tokens": 280752.0, "reward": 0.437593936920166, "reward_std": 0.39731013774871826, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.47506263852119446, "rewards/env_reward/std": 0.2648734450340271, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.057, "frac_reward_zero_std": 0.0, "grad_norm": 6.433754920959473, "kl": 1.647656962275505, "learning_rate": 4.3e-05, "loss": 0.0659, "num_tokens": 283204.0, "reward": -1.3276481628417969, "reward_std": 2.414963722229004, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6725987792015076, "rewards/env_reward/std": 1.551644206047058, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 10.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0575, "frac_reward_zero_std": 0.0, "grad_norm": 2.6330208778381348, "kl": 1.515267439186573, "learning_rate": 4.2888888888888886e-05, "loss": 0.0606, "num_tokens": 285667.0, "reward": -3.7085390090942383, "reward_std": 2.482921838760376, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.201525926589966, "rewards/env_reward/std": 1.5969480276107788, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.058, "frac_reward_zero_std": 0.0, "grad_norm": 2.3021349906921387, "kl": 1.7087249606847763, "learning_rate": 4.277777777777778e-05, "loss": 0.0683, "num_tokens": 288114.0, "reward": 0.5330584049224854, "reward_std": 0.35837167501449585, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5387057065963745, "rewards/env_reward/std": 0.23891450464725494, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0585, "frac_reward_zero_std": 0.0, "grad_norm": 5.497950553894043, "kl": 2.1271141320466995, "learning_rate": 4.266666666666667e-05, "loss": 0.0851, "num_tokens": 290540.0, "reward": 0.4702581763267517, "reward_std": 0.44036781787872314, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4968388080596924, "rewards/env_reward/std": 0.293578565120697, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.059, "frac_reward_zero_std": 0.0, "grad_norm": 4.789796829223633, "kl": 1.4464631527662277, "learning_rate": 4.255555555555556e-05, "loss": 0.0579, "num_tokens": 293023.0, "reward": -0.7475403547286987, "reward_std": 2.8080575466156006, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0703703761100769, "rewards/belief_accuracy/std": 0.059259265661239624, "rewards/env_reward/mean": -0.3492862284183502, "rewards/env_reward/std": 1.7720966339111328, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0595, "frac_reward_zero_std": 1.0, "grad_norm": 1.5602530241012573, "kl": 1.6833766214549541, "learning_rate": 4.2444444444444445e-05, "loss": 0.0673, "num_tokens": 295460.0, "reward": -0.42602595686912537, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.10068397223949432, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 4.7347092628479, "kl": 1.445710226893425, "learning_rate": 4.233333333333334e-05, "loss": 0.0578, "num_tokens": 297907.0, "reward": -1.0152814388275146, "reward_std": 2.623145580291748, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.4643542170524597, "rewards/env_reward/std": 1.6904305219650269, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0605, "frac_reward_zero_std": 0.0, "grad_norm": 6.493462085723877, "kl": 1.4399118982255459, "learning_rate": 4.222222222222222e-05, "loss": 0.0576, "num_tokens": 300352.0, "reward": -0.20259986817836761, "reward_std": 0.06754998862743378, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.048266757279634476, "rewards/env_reward/std": 0.04503332078456879, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.061, "frac_reward_zero_std": 0.0, "grad_norm": 2.4330313205718994, "kl": 1.4334093481302261, "learning_rate": 4.211111111111111e-05, "loss": 0.0573, "num_tokens": 302825.0, "reward": -0.5075480937957764, "reward_std": 0.6749432682991028, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.15503208339214325, "rewards/env_reward/std": 0.44996219873428345, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0615, "frac_reward_zero_std": 0.0, "grad_norm": 4.5208353996276855, "kl": 1.7931447178125381, "learning_rate": 4.2e-05, "loss": 0.0717, "num_tokens": 305291.0, "reward": -2.456667900085449, "reward_std": 2.895482063293457, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3961119651794434, "rewards/env_reward/std": 1.863360047340393, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.062, "frac_reward_zero_std": 0.0, "grad_norm": 2.7689969539642334, "kl": 0.8019402623176575, "learning_rate": 4.188888888888889e-05, "loss": 0.0321, "num_tokens": 307775.0, "reward": -1.4924118518829346, "reward_std": 2.360426187515259, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.782441258430481, "rewards/env_reward/std": 1.5167045593261719, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0625, "frac_reward_zero_std": 0.0, "grad_norm": 3.6446032524108887, "kl": 1.4752652198076248, "learning_rate": 4.177777777777778e-05, "loss": 0.059, "num_tokens": 310203.0, "reward": 0.8273366689682007, "reward_std": 0.6383920311927795, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7348911762237549, "rewards/env_reward/std": 0.4255947172641754, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.063, "frac_reward_zero_std": 0.0, "grad_norm": 5.112602233886719, "kl": 1.6532337069511414, "learning_rate": 4.166666666666667e-05, "loss": 0.0661, "num_tokens": 312653.0, "reward": -0.10410824418067932, "reward_std": 0.01273045688867569, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.11392784118652344, "rewards/env_reward/std": 0.008486974984407425, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0635, "frac_reward_zero_std": 0.0, "grad_norm": 4.351461887359619, "kl": 2.1480718851089478, "learning_rate": 4.155555555555556e-05, "loss": 0.0859, "num_tokens": 315110.0, "reward": -0.3739127516746521, "reward_std": 0.14415279030799866, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.06594181805849075, "rewards/env_reward/std": 0.09610186517238617, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 3.8212032318115234, "kl": 1.534121721982956, "learning_rate": 4.144444444444445e-05, "loss": 0.0614, "num_tokens": 317584.0, "reward": -0.19062533974647522, "reward_std": 0.3150945007801056, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0893678218126297, "rewards/belief_accuracy/std": 0.021264348179101944, "rewards/env_reward/mean": 0.03081876039505005, "rewards/env_reward/std": 0.24891482293605804, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0645, "frac_reward_zero_std": 0.0, "grad_norm": 4.938203811645508, "kl": 2.1303387582302094, "learning_rate": 4.133333333333333e-05, "loss": 0.0852, "num_tokens": 320029.0, "reward": -1.1215572357177734, "reward_std": 2.5629398822784424, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.06083333492279053, "rewards/belief_accuracy/std": 0.07833334058523178, "rewards/env_reward/mean": -0.6177048683166504, "rewards/env_reward/std": 1.588196873664856, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.065, "frac_reward_zero_std": 0.0, "grad_norm": 5.400176048278809, "kl": 1.8505046516656876, "learning_rate": 4.1222222222222224e-05, "loss": 0.074, "num_tokens": 322471.0, "reward": -0.12519629299640656, "reward_std": 0.14092496037483215, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.09986913949251175, "rewards/env_reward/std": 0.09394997358322144, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0655, "frac_reward_zero_std": 0.0, "grad_norm": 3.194300413131714, "kl": 1.2793779149651527, "learning_rate": 4.111111111111111e-05, "loss": 0.0512, "num_tokens": 324940.0, "reward": -1.0001001358032227, "reward_std": 2.700178623199463, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.45423343777656555, "rewards/env_reward/std": 1.743279218673706, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.066, "frac_reward_zero_std": 0.0, "grad_norm": 4.973412036895752, "kl": 1.9231543093919754, "learning_rate": 4.1e-05, "loss": 0.0769, "num_tokens": 327385.0, "reward": -1.7763125896453857, "reward_std": 2.129706382751465, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.9717084169387817, "rewards/env_reward/std": 1.3618686199188232, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 7.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0665, "frac_reward_zero_std": 0.0, "grad_norm": 2.070490598678589, "kl": 1.150221362709999, "learning_rate": 4.088888888888889e-05, "loss": 0.046, "num_tokens": 329838.0, "reward": 0.4534025192260742, "reward_std": 1.0941553115844727, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4856016933917999, "rewards/env_reward/std": 0.7294369339942932, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 10.333333969116211, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.067, "frac_reward_zero_std": 0.0, "grad_norm": 1.878156065940857, "kl": 1.4795889034867287, "learning_rate": 4.0777777777777783e-05, "loss": 0.0592, "num_tokens": 332301.0, "reward": -2.0403780937194824, "reward_std": 3.3793559074401855, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.1185853481292725, "rewards/env_reward/std": 2.1859495639801025, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0675, "frac_reward_zero_std": 1.0, "grad_norm": 0.1908632516860962, "kl": 1.9114599525928497, "learning_rate": 4.066666666666667e-05, "loss": 0.0765, "num_tokens": 334745.0, "reward": 0.5449367761611938, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5466245412826538, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 10.666666984558105, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.068, "frac_reward_zero_std": 0.0, "grad_norm": 3.5480082035064697, "kl": 1.5377977713942528, "learning_rate": 4.055555555555556e-05, "loss": 0.0615, "num_tokens": 337209.0, "reward": -1.2076761722564697, "reward_std": 2.4953103065490723, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5926175117492676, "rewards/env_reward/std": 1.6052173376083374, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0685, "frac_reward_zero_std": 0.0, "grad_norm": 1.0419105291366577, "kl": 1.1934361532330513, "learning_rate": 4.0444444444444444e-05, "loss": 0.0477, "num_tokens": 339665.0, "reward": -1.1166263818740845, "reward_std": 2.606243848800659, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5319175720214844, "rewards/env_reward/std": 1.6803356409072876, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.069, "frac_reward_zero_std": 0.0, "grad_norm": 3.009521007537842, "kl": 2.4323032796382904, "learning_rate": 4.0333333333333336e-05, "loss": 0.0973, "num_tokens": 342138.0, "reward": 0.6563852429389954, "reward_std": 0.8735789656639099, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.057500001043081284, "rewards/belief_accuracy/std": 0.08499999344348907, "rewards/env_reward/mean": 0.5317568182945251, "rewards/env_reward/std": 0.6170323491096497, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.0695, "frac_reward_zero_std": 0.0, "grad_norm": 3.7213194370269775, "kl": 3.0655910074710846, "learning_rate": 4.022222222222222e-05, "loss": 0.1226, "num_tokens": 344573.0, "reward": 0.5249032378196716, "reward_std": 0.1243140697479248, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5332688093185425, "rewards/env_reward/std": 0.0828760415315628, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.07, "frac_reward_zero_std": 1.0, "grad_norm": 0.32563015818595886, "kl": 2.0985984057188034, "learning_rate": 4.011111111111111e-05, "loss": 0.0839, "num_tokens": 347006.0, "reward": -0.07693907618522644, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1320406198501587, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0964287742972374, "kl": 1.2693939208984375, "learning_rate": 4e-05, "loss": 0.0508, "num_tokens": 349465.0, "reward": -0.02015012502670288, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.16989992558956146, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.071, "frac_reward_zero_std": 1.0, "grad_norm": 0.09562604129314423, "kl": 1.128716617822647, "learning_rate": 3.9888888888888895e-05, "loss": 0.0451, "num_tokens": 351950.0, "reward": 0.7584548592567444, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6889699697494507, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0715, "frac_reward_zero_std": 0.0, "grad_norm": 4.745733261108398, "kl": 0.7463721930980682, "learning_rate": 3.977777777777778e-05, "loss": 0.0299, "num_tokens": 354455.0, "reward": 0.3327590823173523, "reward_std": 0.23017629981040955, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.405172735452652, "rewards/env_reward/std": 0.1534508764743805, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 4.888156890869141, "kl": 1.4033671617507935, "learning_rate": 3.966666666666667e-05, "loss": 0.0561, "num_tokens": 356946.0, "reward": -0.13394379615783691, "reward_std": 0.41236963868141174, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.09403747320175171, "rewards/env_reward/std": 0.27491310238838196, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0725, "frac_reward_zero_std": 0.0, "grad_norm": 3.65702486038208, "kl": 1.6139360815286636, "learning_rate": 3.9555555555555556e-05, "loss": 0.0646, "num_tokens": 359381.0, "reward": -1.7238547801971436, "reward_std": 2.1609649658203125, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.12072296440601349, "rewards/belief_accuracy/std": 0.07126190513372421, "rewards/env_reward/mean": -0.9077907204627991, "rewards/env_reward/std": 1.3948062658309937, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.073, "frac_reward_zero_std": 0.0, "grad_norm": 2.8972437381744385, "kl": 1.160056695342064, "learning_rate": 3.944444444444445e-05, "loss": 0.0464, "num_tokens": 361855.0, "reward": 0.13612942397594452, "reward_std": 0.02916666865348816, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0972222238779068, "rewards/belief_accuracy/std": 0.0055555556900799274, "rewards/env_reward/mean": 0.2643640637397766, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0735, "frac_reward_zero_std": 0.0, "grad_norm": 4.382909774780273, "kl": 1.568796619772911, "learning_rate": 3.933333333333333e-05, "loss": 0.0628, "num_tokens": 364331.0, "reward": 0.800137996673584, "reward_std": 0.3069959282875061, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.7167587280273438, "rewards/env_reward/std": 0.2046639323234558, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 19.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.074, "frac_reward_zero_std": 0.0, "grad_norm": 8.64746379852295, "kl": 0.8096725344657898, "learning_rate": 3.922222222222223e-05, "loss": 0.0324, "num_tokens": 366833.0, "reward": -0.15847638249397278, "reward_std": 1.3163249492645264, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.07768243551254272, "rewards/env_reward/std": 0.877549946308136, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0745, "frac_reward_zero_std": 0.0, "grad_norm": 5.473668098449707, "kl": 2.7600976526737213, "learning_rate": 3.9111111111111115e-05, "loss": 0.1104, "num_tokens": 369284.0, "reward": -2.9569857120513916, "reward_std": 2.3198444843292236, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.7296571731567383, "rewards/env_reward/std": 1.4797673225402832, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.075, "frac_reward_zero_std": 0.0, "grad_norm": 3.437213897705078, "kl": 1.6173148602247238, "learning_rate": 3.9000000000000006e-05, "loss": 0.0647, "num_tokens": 371770.0, "reward": 0.5740416049957275, "reward_std": 0.233365997672081, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.13980931043624878, "rewards/belief_accuracy/std": 0.07961863279342651, "rewards/env_reward/mean": 0.6414797306060791, "rewards/env_reward/std": 0.1132500022649765, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0755, "frac_reward_zero_std": 1.0, "grad_norm": 0.2341485172510147, "kl": 1.3624602407217026, "learning_rate": 3.888888888888889e-05, "loss": 0.0545, "num_tokens": 374253.0, "reward": 0.2103109359741211, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3235406279563904, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 23.33333396911621, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.076, "frac_reward_zero_std": 0.0, "grad_norm": 3.359372615814209, "kl": 1.1529072970151901, "learning_rate": 3.877777777777778e-05, "loss": 0.0461, "num_tokens": 376755.0, "reward": 0.4146992564201355, "reward_std": 0.46390998363494873, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.45979946851730347, "rewards/env_reward/std": 0.3092733323574066, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0765, "frac_reward_zero_std": 0.0, "grad_norm": 5.655915260314941, "kl": 1.3275744514539838, "learning_rate": 3.866666666666667e-05, "loss": 0.0531, "num_tokens": 379211.0, "reward": -0.0012441501021385193, "reward_std": 0.24833057820796967, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.23028355836868286, "rewards/belief_accuracy/std": 0.09518812596797943, "rewards/env_reward/mean": 0.42640429735183716, "rewards/env_reward/std": 0.10868140310049057, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.077, "frac_reward_zero_std": 0.0, "grad_norm": 3.993246555328369, "kl": 1.3614933341741562, "learning_rate": 3.855555555555556e-05, "loss": 0.0545, "num_tokens": 381699.0, "reward": -0.3912268280982971, "reward_std": 3.040301561355591, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.01915118098258972, "rewards/env_reward/std": 1.9872325658798218, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0937976986169815, "kl": 1.2733041644096375, "learning_rate": 3.844444444444444e-05, "loss": 0.0509, "num_tokens": 384182.0, "reward": -0.1396826207637787, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.09021158516407013, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.078, "frac_reward_zero_std": 0.0, "grad_norm": 4.676441669464111, "kl": 0.9261074624955654, "learning_rate": 3.8333333333333334e-05, "loss": 0.037, "num_tokens": 386677.0, "reward": -1.3367525339126587, "reward_std": 2.411214828491211, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6786683797836304, "rewards/env_reward/std": 1.5492030382156372, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0785, "frac_reward_zero_std": 0.0, "grad_norm": 2.1922104358673096, "kl": 0.0721854604780674, "learning_rate": 3.8222222222222226e-05, "loss": 0.0029, "num_tokens": 389109.0, "reward": 0.8821967244148254, "reward_std": 0.15713486075401306, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.31481480598449707, "rewards/belief_accuracy/std": 0.052378278225660324, "rewards/env_reward/mean": 1.1844274997711182, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.079, "frac_reward_zero_std": 0.0, "grad_norm": 4.775229454040527, "kl": 1.4617139548063278, "learning_rate": 3.811111111111112e-05, "loss": 0.0585, "num_tokens": 391575.0, "reward": -0.6259194612503052, "reward_std": 0.5253891348838806, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10740740597248077, "rewards/belief_accuracy/std": 0.014814812690019608, "rewards/env_reward/mean": -0.22329813241958618, "rewards/env_reward/std": 0.3683049976825714, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0795, "frac_reward_zero_std": 0.0, "grad_norm": 2.7022299766540527, "kl": 1.3450734540820122, "learning_rate": 3.8e-05, "loss": 0.0538, "num_tokens": 394062.0, "reward": -1.487056851387024, "reward_std": 2.3527774810791016, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0871676579117775, "rewards/belief_accuracy/std": 0.025664685294032097, "rewards/env_reward/mean": -0.8087027072906494, "rewards/env_reward/std": 1.5039740800857544, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 4.860130310058594, "kl": 1.0795547626912594, "learning_rate": 3.7888888888888894e-05, "loss": 0.0432, "num_tokens": 396557.0, "reward": -0.01648128777742386, "reward_std": 0.3920603096485138, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.1723458170890808, "rewards/env_reward/std": 0.2613735496997833, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 18.666667938232422, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0805, "frac_reward_zero_std": 0.0, "grad_norm": 3.8319482803344727, "kl": 1.2551886662840843, "learning_rate": 3.777777777777778e-05, "loss": 0.0502, "num_tokens": 399045.0, "reward": 0.17859038710594177, "reward_std": 0.8184104561805725, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3023936152458191, "rewards/env_reward/std": 0.5456069707870483, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.081, "frac_reward_zero_std": 0.0, "grad_norm": 2.120696783065796, "kl": 0.8586160615086555, "learning_rate": 3.766666666666667e-05, "loss": 0.0343, "num_tokens": 401559.0, "reward": 0.4954003691673279, "reward_std": 0.3572309911251068, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10083333402872086, "rewards/belief_accuracy/std": 0.0016666651936247945, "rewards/env_reward/mean": 0.5111002922058105, "rewards/env_reward/std": 0.24086414277553558, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0815, "frac_reward_zero_std": 0.0, "grad_norm": 8.276159286499023, "kl": 2.0177499651908875, "learning_rate": 3.7555555555555554e-05, "loss": 0.0807, "num_tokens": 404022.0, "reward": -0.14451055228710175, "reward_std": 0.07916668057441711, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08888889104127884, "rewards/belief_accuracy/std": 0.02222222276031971, "rewards/env_reward/mean": 0.06060408055782318, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.082, "frac_reward_zero_std": 0.0, "grad_norm": 5.653500080108643, "kl": 1.8464947640895844, "learning_rate": 3.7444444444444446e-05, "loss": 0.0739, "num_tokens": 406485.0, "reward": 0.27019041776657104, "reward_std": 0.23719999194145203, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11416666209697723, "rewards/belief_accuracy/std": 0.028333332389593124, "rewards/env_reward/mean": 0.3876269459724426, "rewards/env_reward/std": 0.10980000346899033, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 19.33333396911621, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0825, "frac_reward_zero_std": 1.0, "grad_norm": 0.24756693840026855, "kl": 1.6211326867341995, "learning_rate": 3.733333333333334e-05, "loss": 0.0648, "num_tokens": 408975.0, "reward": 0.03895732760429382, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.20930489897727966, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.083, "frac_reward_zero_std": 0.0, "grad_norm": 3.162034034729004, "kl": 1.216068983078003, "learning_rate": 3.722222222222222e-05, "loss": 0.0486, "num_tokens": 411464.0, "reward": -0.5613082051277161, "reward_std": 0.08749997615814209, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.16170544922351837, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0835, "frac_reward_zero_std": 0.0, "grad_norm": 6.356938362121582, "kl": 2.8624762892723083, "learning_rate": 3.7111111111111113e-05, "loss": 0.1145, "num_tokens": 413907.0, "reward": 0.14997538924217224, "reward_std": 0.7540647387504578, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2833169400691986, "rewards/env_reward/std": 0.5027098655700684, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.084, "frac_reward_zero_std": 0.0, "grad_norm": 4.261693477630615, "kl": 1.933813601732254, "learning_rate": 3.7e-05, "loss": 0.0774, "num_tokens": 416402.0, "reward": -0.05508837103843689, "reward_std": 0.11999882757663727, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.09027226269245148, "rewards/belief_accuracy/std": 0.019455470144748688, "rewards/env_reward/mean": 0.12298562377691269, "rewards/env_reward/std": 0.08219999819993973, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0845, "frac_reward_zero_std": 0.0, "grad_norm": 5.13167142868042, "kl": 2.3502594381570816, "learning_rate": 3.688888888888889e-05, "loss": 0.094, "num_tokens": 418887.0, "reward": 0.03691243380308151, "reward_std": 0.08749999105930328, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.2371082901954651, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 15.333333969116211, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.085, "frac_reward_zero_std": 0.0, "grad_norm": 3.1629037857055664, "kl": 1.5842487215995789, "learning_rate": 3.677777777777778e-05, "loss": 0.0634, "num_tokens": 421365.0, "reward": 0.5254287719726562, "reward_std": 0.11898240447044373, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.533619225025177, "rewards/env_reward/std": 0.07932159304618835, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0855, "frac_reward_zero_std": 0.0, "grad_norm": 3.2538228034973145, "kl": 1.3759911209344864, "learning_rate": 3.6666666666666666e-05, "loss": 0.055, "num_tokens": 423851.0, "reward": 1.0300487279891968, "reward_std": 0.04658208787441254, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1066666692495346, "rewards/belief_accuracy/std": 0.013333332724869251, "rewards/env_reward/mean": 0.8791991472244263, "rewards/env_reward/std": 0.01968872733414173, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.086, "frac_reward_zero_std": 1.0, "grad_norm": 0.15501928329467773, "kl": 1.7516742050647736, "learning_rate": 3.655555555555556e-05, "loss": 0.0701, "num_tokens": 426322.0, "reward": 0.030182331800460815, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2034548968076706, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0865, "frac_reward_zero_std": 1.0, "grad_norm": 37.33258056640625, "kl": 8.386772617697716, "learning_rate": 3.644444444444445e-05, "loss": 0.3355, "num_tokens": 428763.0, "reward": -0.9025059342384338, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.41833725571632385, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.087, "frac_reward_zero_std": 1.0, "grad_norm": 0.22313672304153442, "kl": 1.8691215515136719, "learning_rate": 3.633333333333333e-05, "loss": 0.0748, "num_tokens": 431237.0, "reward": -0.16483666002750397, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.07344222813844681, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0875, "frac_reward_zero_std": 0.0, "grad_norm": 4.1793341636657715, "kl": 2.593918561935425, "learning_rate": 3.6222222222222225e-05, "loss": 0.1038, "num_tokens": 433710.0, "reward": -0.9837551116943359, "reward_std": 2.6553149223327637, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.44333672523498535, "rewards/env_reward/std": 1.712130069732666, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.088, "frac_reward_zero_std": 1.0, "grad_norm": 0.10414294898509979, "kl": 1.4019053727388382, "learning_rate": 3.611111111111111e-05, "loss": 0.0561, "num_tokens": 436184.0, "reward": -0.12919571995735168, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.0972028523683548, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0885, "frac_reward_zero_std": 0.0, "grad_norm": 3.139967441558838, "kl": 1.0769911333918571, "learning_rate": 3.6e-05, "loss": 0.0431, "num_tokens": 438657.0, "reward": 0.6967830657958984, "reward_std": 0.08670443296432495, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6478554010391235, "rewards/env_reward/std": 0.05780297517776489, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 10.666666984558105, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.089, "frac_reward_zero_std": 0.0, "grad_norm": 3.0620081424713135, "kl": 1.0732092261314392, "learning_rate": 3.5888888888888886e-05, "loss": 0.0429, "num_tokens": 441121.0, "reward": -1.2789283990859985, "reward_std": 2.598663568496704, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6401189565658569, "rewards/env_reward/std": 1.6776195764541626, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0895, "frac_reward_zero_std": 1.0, "grad_norm": 0.18922147154808044, "kl": 0.8863924369215965, "learning_rate": 3.577777777777778e-05, "loss": 0.0355, "num_tokens": 443635.0, "reward": 1.347588062286377, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 1.0817253589630127, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 3.3570539951324463, "kl": 1.9235362261533737, "learning_rate": 3.566666666666667e-05, "loss": 0.0769, "num_tokens": 446101.0, "reward": 0.05883501470088959, "reward_std": 0.5488622784614563, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.2225566804409027, "rewards/env_reward/std": 0.3659081757068634, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 17.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0905, "frac_reward_zero_std": 1.0, "grad_norm": 0.08723417669534683, "kl": 1.3284604251384735, "learning_rate": 3.555555555555556e-05, "loss": 0.0531, "num_tokens": 448585.0, "reward": 0.6743147373199463, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.6328765153884888, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 9.666666984558105, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.091, "frac_reward_zero_std": 0.0, "grad_norm": 3.3371059894561768, "kl": 1.4546705782413483, "learning_rate": 3.5444444444444445e-05, "loss": 0.0582, "num_tokens": 451046.0, "reward": -0.3032863438129425, "reward_std": 0.33148258924484253, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.018857555463910103, "rewards/env_reward/std": 0.22098839282989502, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0915, "frac_reward_zero_std": 0.0, "grad_norm": 2.911696195602417, "kl": 1.4696582406759262, "learning_rate": 3.5333333333333336e-05, "loss": 0.0588, "num_tokens": 453544.0, "reward": -2.4303359985351562, "reward_std": 2.9513607025146484, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3785573244094849, "rewards/env_reward/std": 1.9012062549591064, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 13.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.092, "frac_reward_zero_std": 0.0, "grad_norm": 3.6932320594787598, "kl": 1.533248096704483, "learning_rate": 3.522222222222222e-05, "loss": 0.0613, "num_tokens": 456016.0, "reward": -1.5931193828582764, "reward_std": 3.8826847076416016, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.8204129934310913, "rewards/env_reward/std": 2.521214485168457, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0925, "frac_reward_zero_std": 0.0, "grad_norm": 4.075229167938232, "kl": 1.968793198466301, "learning_rate": 3.511111111111111e-05, "loss": 0.0788, "num_tokens": 458465.0, "reward": -0.23590603470802307, "reward_std": 0.2219301015138626, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.026062656193971634, "rewards/env_reward/std": 0.14795339107513428, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.093, "frac_reward_zero_std": 1.0, "grad_norm": 2.784778594970703, "kl": 2.159162014722824, "learning_rate": 3.5e-05, "loss": 0.0864, "num_tokens": 460898.0, "reward": -0.3167000114917755, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.027799999341368675, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 26.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0935, "frac_reward_zero_std": 0.0, "grad_norm": 3.384214162826538, "kl": 1.2219679579138756, "learning_rate": 3.4888888888888895e-05, "loss": 0.0489, "num_tokens": 463415.0, "reward": -1.0766644477844238, "reward_std": 2.582223653793335, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5052763223648071, "rewards/env_reward/std": 1.6631492376327515, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.094, "frac_reward_zero_std": 0.0, "grad_norm": 3.3013856410980225, "kl": 1.2444797977805138, "learning_rate": 3.477777777777778e-05, "loss": 0.0498, "num_tokens": 465874.0, "reward": -2.264209508895874, "reward_std": 3.1193079948425293, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.2678064107894897, "rewards/env_reward/std": 2.0125834941864014, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0945, "frac_reward_zero_std": 0.0, "grad_norm": 2.398057460784912, "kl": 1.1671398282051086, "learning_rate": 3.466666666666667e-05, "loss": 0.0467, "num_tokens": 468342.0, "reward": -0.32055363059043884, "reward_std": 0.08688756823539734, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.03036908432841301, "rewards/env_reward/std": 0.05792504921555519, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 7.333333492279053, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.095, "frac_reward_zero_std": 0.0, "grad_norm": 2.1980197429656982, "kl": 1.4500057846307755, "learning_rate": 3.4555555555555556e-05, "loss": 0.058, "num_tokens": 470796.0, "reward": 0.38532906770706177, "reward_std": 0.15841148793697357, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4402194023132324, "rewards/env_reward/std": 0.10560767352581024, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0955, "frac_reward_zero_std": 0.0, "grad_norm": 2.9431092739105225, "kl": 1.4747809767723083, "learning_rate": 3.444444444444445e-05, "loss": 0.059, "num_tokens": 473289.0, "reward": -1.0166206359863281, "reward_std": 0.02063235081732273, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.49441370368003845, "rewards/env_reward/std": 0.0137548903003335, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 7.505349159240723, "kl": 1.3912545293569565, "learning_rate": 3.433333333333333e-05, "loss": 0.0557, "num_tokens": 475757.0, "reward": -0.6259548664093018, "reward_std": 2.8915553092956543, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08980958163738251, "rewards/belief_accuracy/std": 0.02038082852959633, "rewards/env_reward/mean": -0.22935077548027039, "rewards/env_reward/std": 1.8575823307037354, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0965, "frac_reward_zero_std": 0.0, "grad_norm": 2.4287314414978027, "kl": 1.0645422227680683, "learning_rate": 3.4222222222222224e-05, "loss": 0.0426, "num_tokens": 478239.0, "reward": -1.1361416578292847, "reward_std": 2.5584716796875, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.18546631932258606, "rewards/belief_accuracy/std": 0.059523556381464005, "rewards/env_reward/mean": -0.3864951729774475, "rewards/env_reward/std": 1.7521181106567383, "rewards/format_valid/mean": 0.25, "rewards/format_valid/std": 1.5, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.097, "frac_reward_zero_std": 0.0, "grad_norm": 2.376044511795044, "kl": 0.6852857172489166, "learning_rate": 3.411111111111111e-05, "loss": 0.0274, "num_tokens": 480748.0, "reward": 0.5333235263824463, "reward_std": 0.08749997615814209, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.5680490136146545, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0975, "frac_reward_zero_std": 0.0, "grad_norm": 3.4659929275512695, "kl": 1.7528847455978394, "learning_rate": 3.4000000000000007e-05, "loss": 0.0701, "num_tokens": 483203.0, "reward": 1.018233299255371, "reward_std": 0.009551048278808594, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.862155556678772, "rewards/env_reward/std": 0.006367385853081942, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.098, "frac_reward_zero_std": 0.0, "grad_norm": 7.004715442657471, "kl": 0.813011210411787, "learning_rate": 3.388888888888889e-05, "loss": 0.0325, "num_tokens": 485711.0, "reward": -2.4635062217712402, "reward_std": 2.8720784187316895, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.4006710052490234, "rewards/env_reward/std": 1.8473838567733765, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0985, "frac_reward_zero_std": 0.0, "grad_norm": 3.282201051712036, "kl": 0.7132957000285387, "learning_rate": 3.377777777777778e-05, "loss": 0.0285, "num_tokens": 488216.0, "reward": -3.926431894302368, "reward_std": 2.047135829925537, "rewards/action_legal/mean": -0.75, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -2.346787929534912, "rewards/env_reward/std": 1.3064239025115967, "rewards/format_valid/mean": -1.375, "rewards/format_valid/std": 1.25, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.099, "frac_reward_zero_std": 0.0, "grad_norm": 1.8574674129486084, "kl": 1.1133069694042206, "learning_rate": 3.366666666666667e-05, "loss": 0.0445, "num_tokens": 490705.0, "reward": 0.6392979621887207, "reward_std": 0.2728678584098816, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.044744670391082764, "rewards/belief_accuracy/std": 0.11051066219806671, "rewards/env_reward/mean": 0.4948546886444092, "rewards/env_reward/std": 0.41126659512519836, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0995, "frac_reward_zero_std": 0.0, "grad_norm": 2.2241978645324707, "kl": 1.400051310658455, "learning_rate": 3.355555555555556e-05, "loss": 0.056, "num_tokens": 493194.0, "reward": 0.16355225443840027, "reward_std": 0.2303662747144699, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.29236820340156555, "rewards/env_reward/std": 0.153577521443367, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 27.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 3.1241228580474854, "kl": 0.7815838046371937, "learning_rate": 3.3444444444444443e-05, "loss": 0.0313, "num_tokens": 495713.0, "reward": -1.4204142093658447, "reward_std": 2.6858582496643066, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7344428300857544, "rewards/env_reward/std": 1.739694356918335, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 13.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1005, "frac_reward_zero_std": 0.0, "grad_norm": 3.880967617034912, "kl": 1.6194000542163849, "learning_rate": 3.3333333333333335e-05, "loss": 0.0648, "num_tokens": 498186.0, "reward": -1.1433579921722412, "reward_std": 2.5394091606140137, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5497387051582336, "rewards/env_reward/std": 1.6346454620361328, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.101, "frac_reward_zero_std": 0.0, "grad_norm": 3.6157476902008057, "kl": 1.4809669330716133, "learning_rate": 3.322222222222222e-05, "loss": 0.0592, "num_tokens": 500648.0, "reward": -0.7693363428115845, "reward_std": 2.7953262329101562, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0716666653752327, "rewards/belief_accuracy/std": 0.05666666850447655, "rewards/env_reward/mean": -0.3612242341041565, "rewards/env_reward/std": 1.7597646713256836, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1015, "frac_reward_zero_std": 0.0, "grad_norm": 2.5044844150543213, "kl": 0.5702618137001991, "learning_rate": 3.311111111111112e-05, "loss": 0.0228, "num_tokens": 503079.0, "reward": -0.09709322452545166, "reward_std": 0.09302432835102081, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1680680364370346, "rewards/belief_accuracy/std": 0.03100811131298542, "rewards/env_reward/mean": 0.2380739152431488, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.102, "frac_reward_zero_std": 0.0, "grad_norm": 2.225451707839966, "kl": 0.6446680650115013, "learning_rate": 3.3e-05, "loss": 0.0258, "num_tokens": 505599.0, "reward": -1.0992940664291382, "reward_std": 2.570491075515747, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": -0.49119603633880615, "rewards/env_reward/std": 1.673166036605835, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1025, "frac_reward_zero_std": 0.0, "grad_norm": 11.69345760345459, "kl": 0.9406535923480988, "learning_rate": 3.2888888888888894e-05, "loss": 0.0376, "num_tokens": 508093.0, "reward": -1.0868068933486938, "reward_std": 2.6226813793182373, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5120379328727722, "rewards/env_reward/std": 1.6912070512771606, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.103, "frac_reward_zero_std": 0.0, "grad_norm": 3.8141772747039795, "kl": 0.38117800280451775, "learning_rate": 3.277777777777778e-05, "loss": 0.0152, "num_tokens": 510301.0, "reward": 0.5716937780380249, "reward_std": 0.2175557017326355, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.34779584407806396, "rewards/env_reward/std": 0.14503712952136993, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1035, "frac_reward_zero_std": 0.0, "grad_norm": 2.4618332386016846, "kl": 1.3801769241690636, "learning_rate": 3.266666666666667e-05, "loss": 0.0552, "num_tokens": 512788.0, "reward": -1.348587989807129, "reward_std": 2.476418972015381, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6865587830543518, "rewards/env_reward/std": 1.5944546461105347, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 7.705834865570068, "kl": 1.0085995495319366, "learning_rate": 3.2555555555555555e-05, "loss": 0.0403, "num_tokens": 515256.0, "reward": -1.0385560989379883, "reward_std": 2.6225454807281494, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.4798707962036133, "rewards/env_reward/std": 1.6903735399246216, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 11.333333969116211, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1045, "frac_reward_zero_std": 0.0, "grad_norm": 5.924132823944092, "kl": 1.7609535232186317, "learning_rate": 3.2444444444444446e-05, "loss": 0.0704, "num_tokens": 517722.0, "reward": -1.3413997888565063, "reward_std": 2.4143919944763184, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.6817665696144104, "rewards/env_reward/std": 1.5514785051345825, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.105, "frac_reward_zero_std": 0.0, "grad_norm": 7.369988918304443, "kl": 1.1772667318582535, "learning_rate": 3.233333333333333e-05, "loss": 0.0471, "num_tokens": 520224.0, "reward": 0.25493913888931274, "reward_std": 0.33257579803466797, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3532927930355072, "rewards/env_reward/std": 0.2217172235250473, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1055, "frac_reward_zero_std": 0.0, "grad_norm": 2.5913333892822266, "kl": 0.7529645264148712, "learning_rate": 3.222222222222223e-05, "loss": 0.0301, "num_tokens": 522738.0, "reward": -1.3859096765518188, "reward_std": 2.4013755321502686, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.7114397883415222, "rewards/env_reward/std": 1.5432217121124268, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.106, "frac_reward_zero_std": 0.0, "grad_norm": 4.363038539886475, "kl": 1.2446223124861717, "learning_rate": 3.2111111111111114e-05, "loss": 0.0498, "num_tokens": 525236.0, "reward": -2.56288743019104, "reward_std": 2.7684178352355957, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.466925024986267, "rewards/env_reward/std": 1.7785577774047852, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1065, "frac_reward_zero_std": 0.0, "grad_norm": 4.272139549255371, "kl": 1.5829559713602066, "learning_rate": 3.2000000000000005e-05, "loss": 0.0633, "num_tokens": 527450.0, "reward": 1.3202344179153442, "reward_std": 0.7838823199272156, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.84682297706604, "rewards/env_reward/std": 0.5225882530212402, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.107, "frac_reward_zero_std": 0.0, "grad_norm": 5.658717632293701, "kl": 1.2298424392938614, "learning_rate": 3.188888888888889e-05, "loss": 0.0492, "num_tokens": 529929.0, "reward": -1.5011694431304932, "reward_std": 2.299220323562622, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.78827965259552, "rewards/env_reward/std": 1.474480390548706, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 20.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1075, "frac_reward_zero_std": 0.0, "grad_norm": 3.059485912322998, "kl": 0.9900188595056534, "learning_rate": 3.177777777777778e-05, "loss": 0.0396, "num_tokens": 532434.0, "reward": -2.2220005989074707, "reward_std": 3.1529808044433594, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.2396671772003174, "rewards/env_reward/std": 2.0346951484680176, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.108, "frac_reward_zero_std": 0.0, "grad_norm": 5.903228282928467, "kl": 0.9180602729320526, "learning_rate": 3.1666666666666666e-05, "loss": 0.0367, "num_tokens": 534946.0, "reward": 0.06937577575445175, "reward_std": 0.3579734265804291, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.05766364932060242, "rewards/belief_accuracy/std": 0.084672711789608, "rewards/env_reward/mean": 0.14074449241161346, "rewards/env_reward/std": 0.25905489921569824, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 16.666667938232422, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1085, "frac_reward_zero_std": 0.0, "grad_norm": 7.5056562423706055, "kl": 1.428985133767128, "learning_rate": 3.155555555555556e-05, "loss": 0.0572, "num_tokens": 537428.0, "reward": -0.04300477355718613, "reward_std": 0.1483583301305771, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.15466348826885223, "rewards/env_reward/std": 0.09890555590391159, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.109, "frac_reward_zero_std": 0.0, "grad_norm": 2.7241830825805664, "kl": 0.9578761979937553, "learning_rate": 3.144444444444445e-05, "loss": 0.0383, "num_tokens": 539948.0, "reward": -1.9331963062286377, "reward_std": 2.0602900981903076, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.0716666653752327, "rewards/belief_accuracy/std": 0.05666666850447655, "rewards/env_reward/mean": -1.137130856513977, "rewards/env_reward/std": 1.2618883848190308, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1095, "frac_reward_zero_std": 0.0, "grad_norm": 4.32493782043457, "kl": 0.58867571875453, "learning_rate": 3.1333333333333334e-05, "loss": 0.0235, "num_tokens": 542468.0, "reward": -1.2151740789413452, "reward_std": 2.491729736328125, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5976160764694214, "rewards/env_reward/std": 1.6028647422790527, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.11, "frac_reward_zero_std": 0.0, "grad_norm": 1.791263461112976, "kl": 0.3568975552916527, "learning_rate": 3.1222222222222225e-05, "loss": 0.0143, "num_tokens": 544900.0, "reward": 0.03496697545051575, "reward_std": 0.06415002793073654, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.17913591861724854, "rewards/belief_accuracy/std": 0.021383339539170265, "rewards/env_reward/mean": 0.3482498526573181, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1105, "frac_reward_zero_std": 0.0, "grad_norm": 2.383763313293457, "kl": 0.8193067982792854, "learning_rate": 3.111111111111111e-05, "loss": 0.0328, "num_tokens": 547414.0, "reward": 1.203812599182129, "reward_std": 0.6176812648773193, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9858750700950623, "rewards/env_reward/std": 0.41178756952285767, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.111, "frac_reward_zero_std": 0.0, "grad_norm": 2.43298602104187, "kl": 1.0093542635440826, "learning_rate": 3.1e-05, "loss": 0.0404, "num_tokens": 549923.0, "reward": -0.0007572025060653687, "reward_std": 0.016494423151016235, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10740740597248077, "rewards/belief_accuracy/std": 0.014814812690019608, "rewards/env_reward/mean": 0.19347669184207916, "rewards/env_reward/std": 0.010300002992153168, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1115, "frac_reward_zero_std": 0.0, "grad_norm": 6.343346118927002, "kl": 0.7701031491160393, "learning_rate": 3.088888888888889e-05, "loss": 0.0308, "num_tokens": 552439.0, "reward": -1.5342886447906494, "reward_std": 2.2832674980163574, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.08815178275108337, "rewards/belief_accuracy/std": 0.023696430027484894, "rewards/env_reward/mean": -0.8382222652435303, "rewards/env_reward/std": 1.4423881769180298, "rewards/format_valid/mean": 0.0, "rewards/format_valid/std": 1.3540064096450806, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 3.0897319316864014, "kl": 0.8843832314014435, "learning_rate": 3.077777777777778e-05, "loss": 0.0354, "num_tokens": 554948.0, "reward": 0.1540832221508026, "reward_std": 0.3211406171321869, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.08613713085651398, "rewards/belief_accuracy/std": 0.02772573195397854, "rewards/env_reward/mean": 0.25416308641433716, "rewards/env_reward/std": 0.18371644616127014, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1125, "frac_reward_zero_std": 0.0, "grad_norm": 14.237890243530273, "kl": 0.8649509251117706, "learning_rate": 3.066666666666667e-05, "loss": 0.0346, "num_tokens": 557456.0, "reward": 1.5224132537841797, "reward_std": 1.171297311782837, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.0333574041724205, "rewards/belief_accuracy/std": 0.13328517973423004, "rewards/env_reward/mean": 1.060823678970337, "rewards/env_reward/std": 0.900728166103363, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 26.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.113, "frac_reward_zero_std": 0.0, "grad_norm": 2.398747205734253, "kl": 0.5030911080539227, "learning_rate": 3.055555555555556e-05, "loss": 0.0201, "num_tokens": 559973.0, "reward": -0.10044729709625244, "reward_std": 0.14265108108520508, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.11636848002672195, "rewards/env_reward/std": 0.09510072320699692, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 1.0, "completions/max_length": 32.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "epoch": 0.1135, "frac_reward_zero_std": 0.0, "grad_norm": 3.1962196826934814, "kl": 0.7515930682420731, "learning_rate": 3.044444444444445e-05, "loss": 0.0301, "num_tokens": 562501.0, "reward": 0.33509939908981323, "reward_std": 0.4751393795013428, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.406732976436615, "rewards/env_reward/std": 0.3167595863342285, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.114, "frac_reward_zero_std": 0.0, "grad_norm": 3.724390745162964, "kl": 1.3262446075677872, "learning_rate": 3.0333333333333337e-05, "loss": 0.053, "num_tokens": 565018.0, "reward": -2.383183717727661, "reward_std": 2.9639039039611816, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.3471225500106812, "rewards/env_reward/std": 1.9085785150527954, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1145, "frac_reward_zero_std": 0.0, "grad_norm": 3.0508358478546143, "kl": 0.805017001926899, "learning_rate": 3.0222222222222225e-05, "loss": 0.0322, "num_tokens": 567513.0, "reward": -1.0678391456604004, "reward_std": 2.5887842178344727, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.4993927776813507, "rewards/env_reward/std": 1.6675386428833008, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.115, "frac_reward_zero_std": 0.0, "grad_norm": 2.8914108276367188, "kl": 1.2341727763414383, "learning_rate": 3.0111111111111113e-05, "loss": 0.0494, "num_tokens": 570012.0, "reward": -1.0324312448501587, "reward_std": 0.03943846374750137, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.5049540996551514, "rewards/env_reward/std": 0.026292279362678528, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1155, "frac_reward_zero_std": 0.0, "grad_norm": 3.649041175842285, "kl": 0.9338645786046982, "learning_rate": 3e-05, "loss": 0.0374, "num_tokens": 572521.0, "reward": -2.383704423904419, "reward_std": 2.9854514598846436, "rewards/action_legal/mean": -0.5, "rewards/action_legal/std": 0.5773502588272095, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -1.347469687461853, "rewards/env_reward/std": 1.9234607219696045, "rewards/format_valid/mean": -0.75, "rewards/format_valid/std": 1.4433757066726685, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.116, "frac_reward_zero_std": 0.0, "grad_norm": 5.132113933563232, "kl": 1.6651656776666641, "learning_rate": 2.988888888888889e-05, "loss": 0.0666, "num_tokens": 575028.0, "reward": -0.17734336853027344, "reward_std": 0.43010595440864563, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.09427107125520706, "rewards/env_reward/std": 0.236448734998703, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1165, "frac_reward_zero_std": 0.0, "grad_norm": 2.701383590698242, "kl": 1.4364068657159805, "learning_rate": 2.9777777777777777e-05, "loss": 0.0575, "num_tokens": 577518.0, "reward": 0.2805197834968567, "reward_std": 0.16961893439292908, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3703465461730957, "rewards/env_reward/std": 0.11307929456233978, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.117, "frac_reward_zero_std": 0.0, "grad_norm": 4.781160354614258, "kl": 0.3255625441670418, "learning_rate": 2.9666666666666672e-05, "loss": 0.013, "num_tokens": 579950.0, "reward": 0.2083221822977066, "reward_std": 0.367115318775177, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.15008686482906342, "rewards/belief_accuracy/std": 0.12237177044153214, "rewards/env_reward/mean": 0.40572187304496765, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1175, "frac_reward_zero_std": 0.0, "grad_norm": 3.328470468521118, "kl": 1.0922381281852722, "learning_rate": 2.955555555555556e-05, "loss": 0.0437, "num_tokens": 582468.0, "reward": 0.41766709089279175, "reward_std": 0.20472979545593262, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.46177807450294495, "rewards/env_reward/std": 0.13648654520511627, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.118, "frac_reward_zero_std": 0.0, "grad_norm": 3.3024332523345947, "kl": 1.729993849992752, "learning_rate": 2.9444444444444448e-05, "loss": 0.0692, "num_tokens": 584951.0, "reward": 0.5654071569442749, "reward_std": 0.20379649102687836, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.13415177166461945, "rewards/belief_accuracy/std": 0.06830354034900665, "rewards/env_reward/mean": 0.6244083642959595, "rewards/env_reward/std": 0.07622048258781433, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1185, "frac_reward_zero_std": 0.0, "grad_norm": 3.8457822799682617, "kl": 2.1087397560477257, "learning_rate": 2.9333333333333336e-05, "loss": 0.0843, "num_tokens": 587433.0, "reward": 1.2190449237823486, "reward_std": 0.21189068257808685, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.9960300326347351, "rewards/env_reward/std": 0.1412605196237564, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.119, "frac_reward_zero_std": 1.0, "grad_norm": 0.029239589348435402, "kl": 0.5205878019332886, "learning_rate": 2.9222222222222224e-05, "loss": 0.0208, "num_tokens": 589641.0, "reward": 0.6929494738578796, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.4286329746246338, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1195, "frac_reward_zero_std": 0.0, "grad_norm": 5.953148365020752, "kl": 1.5010789930820465, "learning_rate": 2.9111111111111112e-05, "loss": 0.06, "num_tokens": 592116.0, "reward": -0.19302129745483398, "reward_std": 0.11821135133504868, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.05465248227119446, "rewards/env_reward/std": 0.07880757749080658, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.12, "frac_reward_zero_std": 1.0, "grad_norm": 0.028769580647349358, "kl": 0.5208476185798645, "learning_rate": 2.9e-05, "loss": 0.0208, "num_tokens": 594324.0, "reward": 0.7799785137176514, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": 0.0, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.48665234446525574, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1205, "frac_reward_zero_std": 0.0, "grad_norm": 3.084097146987915, "kl": 1.6838389933109283, "learning_rate": 2.8888888888888888e-05, "loss": 0.0674, "num_tokens": 596812.0, "reward": -0.08287781476974487, "reward_std": 3.244748115539551, "rewards/action_legal/mean": -0.25, "rewards/action_legal/std": 0.5, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.15724816918373108, "rewards/env_reward/std": 2.104832172393799, "rewards/format_valid/mean": -0.125, "rewards/format_valid/std": 1.25, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 12.333333969116211, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.121, "frac_reward_zero_std": 0.0, "grad_norm": 2.273799180984497, "kl": 1.8537000715732574, "learning_rate": 2.877777777777778e-05, "loss": 0.0741, "num_tokens": 599281.0, "reward": 0.5991692543029785, "reward_std": 0.3846488893032074, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5827795267105103, "rewards/env_reward/std": 0.25643259286880493, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 25.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1215, "frac_reward_zero_std": 0.0, "grad_norm": 6.259014129638672, "kl": 1.056531861424446, "learning_rate": 2.8666666666666668e-05, "loss": 0.0423, "num_tokens": 601802.0, "reward": 1.1337945461273193, "reward_std": 0.15877185761928558, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.11666667461395264, "rewards/belief_accuracy/std": 0.03333333507180214, "rewards/env_reward/mean": 0.9683631062507629, "rewards/env_reward/std": 0.1437581330537796, "rewards/format_valid/mean": 0.625, "rewards/format_valid/std": 0.25, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.122, "frac_reward_zero_std": 0.0, "grad_norm": 3.7003579139709473, "kl": 0.6380213499069214, "learning_rate": 2.855555555555556e-05, "loss": 0.0255, "num_tokens": 604330.0, "reward": 0.300573468208313, "reward_std": 0.29817959666252136, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.3837156891822815, "rewards/env_reward/std": 0.1987864077091217, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.75, "completions/max_length": 32.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1225, "frac_reward_zero_std": 0.0, "grad_norm": 6.614965438842773, "kl": 1.0555044412612915, "learning_rate": 2.8444444444444447e-05, "loss": 0.0422, "num_tokens": 606840.0, "reward": 0.2127276510000229, "reward_std": 0.07096138596534729, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.32515180110931396, "rewards/env_reward/std": 0.04730759561061859, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.123, "frac_reward_zero_std": 0.0, "grad_norm": 4.199899673461914, "kl": 1.0866071283817291, "learning_rate": 2.8333333333333335e-05, "loss": 0.0435, "num_tokens": 609314.0, "reward": 0.6166397929191589, "reward_std": 0.013659524731338024, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.5944265127182007, "rewards/env_reward/std": 0.009106338024139404, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.25, "completions/max_length": 32.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 17.33333396911621, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1235, "frac_reward_zero_std": 1.0, "grad_norm": 0.5806828737258911, "kl": 2.1766858994960785, "learning_rate": 2.8222222222222223e-05, "loss": 0.0871, "num_tokens": 611798.0, "reward": -0.33692148327827454, "reward_std": 0.0, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": -0.04128097742795944, "rewards/env_reward/std": 0.0, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.5, "completions/max_length": 32.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.124, "frac_reward_zero_std": 0.0, "grad_norm": 2.1608145236968994, "kl": 0.6925233453512192, "learning_rate": 2.811111111111111e-05, "loss": 0.0277, "num_tokens": 614279.0, "reward": -0.17361339926719666, "reward_std": 0.17766423523426056, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.06759107857942581, "rewards/env_reward/std": 0.1184428334236145, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1245, "frac_reward_zero_std": 0.0, "grad_norm": 4.571046352386475, "kl": 1.0061021000146866, "learning_rate": 2.8000000000000003e-05, "loss": 0.0402, "num_tokens": 616713.0, "reward": 0.13527683913707733, "reward_std": 0.11952438950538635, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.1666666716337204, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.39018458127975464, "rewards/env_reward/std": 0.07968293130397797, "rewards/format_valid/mean": 1.0, "rewards/format_valid/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.125, "frac_reward_zero_std": 0.0, "grad_norm": 8.622580528259277, "kl": 3.023313194513321, "learning_rate": 2.788888888888889e-05, "loss": 0.1209, "num_tokens": 619160.0, "reward": 1.0675362348556519, "reward_std": 0.21132755279541016, "rewards/action_legal/mean": 0.0, "rewards/action_legal/std": 0.0, "rewards/belief_accuracy/mean": -0.10000000149011612, "rewards/belief_accuracy/std": 0.0, "rewards/env_reward/mean": 0.8950241804122925, "rewards/env_reward/std": 0.14088504016399384, "rewards/format_valid/mean": 0.5, "rewards/format_valid/std": 0.0, "step": 250 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 619160, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }