From 48aa4e7a78f1cc6d672a721d8aa832f9a9de1c15 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 10 Apr 2026 18:10:56 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: johnjeanc/OpenRS-GRPO Source: Original Platform --- .gitattributes | 36 + README.md | 70 + all_results.json | 8 + config.json | 30 + generation_config.json | 9 + model.safetensors | 3 + special_tokens_map.json | 23 + tokenizer.json | 3 + tokenizer_config.json | 195 ++ train_phase1_results.json | 8 + train_results.json | 8 + trainer_state.json | 4242 +++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 4638 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_phase1_results.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs 
merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..9de5b15 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +datasets: knoveleng/open-rs +library_name: transformers +model_name: OpenRS-GRPO +tags: +- generated_from_trainer +- open-r1 +- trl +- grpo +licence: license +--- + +# Model Card for OpenRS-GRPO + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="johnjeanc/OpenRS-GRPO", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/112703024-national-chengchi-university/huggingface/runs/j80wyoaz) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.16.0.dev0 +- Transformers: 4.49.0 +- Pytorch: 2.5.1 +- Datasets: 4.8.4 +- Tokenizers: 0.21.4 + +## Citations + +Cite GRPO as: + +```bibtex +@article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..5262ef4 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.09362538470280318, + "train_runtime": 37056.8122, + "train_samples": 7000, + "train_samples_per_second": 0.291, + "train_steps_per_second": 0.008 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..7052064 --- /dev/null 
+++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..01dfe4b --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95, + "transformers_version": "4.49.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..b7bf770 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da80c97dd074126fa337874e65fb83d2ec1620d12b7ce7d3067e9a9ec05c69c8 +size 3554214752 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..1d385d6 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e7cd2c1 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a +size 11422959 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ef6e98c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,195 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt 
is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not 
ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_phase1_results.json b/train_phase1_results.json new file mode 100644 index 0000000..9bceb03 --- /dev/null +++ b/train_phase1_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.04891544524503115, + "train_runtime": 53118.1614, + "train_samples": 7000, + "train_samples_per_second": 0.339, + "train_steps_per_second": 0.005 +} \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..5262ef4 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 0.0, + "train_loss": 0.09362538470280318, + "train_runtime": 37056.8122, + "train_samples": 7000, + "train_samples_per_second": 0.291, + "train_steps_per_second": 0.008 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..470dcfd --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4242 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2570694087403599, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 3156.8612060546875, + "epoch": 0.000856898029134533, + "grad_norm": 0.08138668537139893, + "kl": 0.0, + "learning_rate": 3.3333333333333334e-08, + "loss": 0.0383, + "reward": -2.583333373069763, + "reward_std": 0.09128709882497787, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4166666716337204, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 3294.638916015625, + 
"epoch": 0.001713796058269066, + "grad_norm": 0.3450618088245392, + "kl": 0.0, + "learning_rate": 6.666666666666667e-08, + "loss": 0.1933, + "reward": -2.6666667461395264, + "reward_std": 0.2221490517258644, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333432674408, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 3111.7222900390625, + "epoch": 0.002570694087403599, + "grad_norm": 0.4438176453113556, + "kl": 4.208087921142578e-05, + "learning_rate": 1e-07, + "loss": 0.1213, + "reward": -2.435185194015503, + "reward_std": 0.46697859466075897, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.4722222238779068, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 3262.8612060546875, + "epoch": 0.003427592116538132, + "grad_norm": 0.1571539044380188, + "kl": 3.17692756652832e-05, + "learning_rate": 1.3333333333333334e-07, + "loss": 0.1405, + "reward": -2.6944445371627808, + "reward_std": 0.15410767495632172, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3055555671453476, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 3076.166748046875, + "epoch": 0.004284490145672665, + "grad_norm": 0.0744488388299942, + "kl": 4.2378902435302734e-05, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0441, + "reward": -2.4722222089767456, + "reward_std": 0.15410767495632172, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5277777910232544, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 3411.5277099609375, + "epoch": 0.005141388174807198, + "grad_norm": 0.1373002976179123, + "kl": 2.950429916381836e-05, + "learning_rate": 2e-07, + "loss": 0.0469, + "reward": -2.583333373069763, + "reward_std": 0.33668186515569687, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.361111119389534, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 3255.5556640625, + "epoch": 0.005998286203941731, + 
"grad_norm": 0.1415766477584839, + "kl": 3.135204315185547e-05, + "learning_rate": 2.3333333333333333e-07, + "loss": 0.0729, + "reward": -2.5, + "reward_std": 0.2453947737812996, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5000000149011612, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 2561.388916015625, + "epoch": 0.006855184233076264, + "grad_norm": 0.3355005085468292, + "kl": 3.647804260253906e-05, + "learning_rate": 2.6666666666666667e-07, + "loss": 0.0923, + "reward": -2.170351266860962, + "reward_std": 0.6446175873279572, + "rewards/confidence_reward": -2.809240221977234, + "rewards/format_reward": 0.6388888955116272, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 3178.75, + "epoch": 0.007712082262210797, + "grad_norm": 0.2505446970462799, + "kl": 2.8371810913085938e-05, + "learning_rate": 3e-07, + "loss": 0.2008, + "reward": -2.4722222089767456, + "reward_std": 0.409944087266922, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.4722222238779068, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 2501.1944580078125, + "epoch": 0.00856898029134533, + "grad_norm": 0.2691773772239685, + "kl": 2.0295381546020508e-05, + "learning_rate": 3.333333333333333e-07, + "loss": 0.1208, + "reward": -2.4166667461395264, + "reward_std": 0.15410767495632172, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5833333283662796, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 3531.8333740234375, + "epoch": 0.009425878320479864, + "grad_norm": 0.1814609318971634, + "kl": 3.612041473388672e-05, + "learning_rate": 3.666666666666666e-07, + "loss": 0.0473, + "reward": -2.555555582046509, + "reward_std": 0.3082153648138046, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444552063942, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 3414.6666259765625, + "epoch": 0.010282776349614395, + "grad_norm": 
0.14940866827964783, + "kl": 4.470348358154297e-05, + "learning_rate": 4e-07, + "loss": 0.1065, + "reward": -2.6388888359069824, + "reward_std": 0.25061557441949844, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3611111268401146, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 2698.2222900390625, + "epoch": 0.011139674378748929, + "grad_norm": 0.193411186337471, + "kl": 3.445148468017578e-05, + "learning_rate": 4.3333333333333335e-07, + "loss": 0.0232, + "reward": -2.2592592239379883, + "reward_std": 0.37295398116111755, + "rewards/confidence_reward": -2.814814805984497, + "rewards/format_reward": 0.5555555671453476, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 1606.9166870117188, + "epoch": 0.011996572407883462, + "grad_norm": 0.1921149641275406, + "kl": 2.7477741241455078e-05, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.0873, + "reward": -1.929610550403595, + "reward_std": 0.4556204080581665, + "rewards/confidence_reward": -2.7907216548919678, + "rewards/format_reward": 0.8611111044883728, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 2907.9444580078125, + "epoch": 0.012853470437017995, + "grad_norm": 0.1692344844341278, + "kl": 2.485513687133789e-05, + "learning_rate": 5e-07, + "loss": 0.0894, + "reward": -2.305555582046509, + "reward_std": 0.22736986726522446, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.6944444477558136, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 2834.5833740234375, + "epoch": 0.013710368466152529, + "grad_norm": 0.20961543917655945, + "kl": 2.6166439056396484e-05, + "learning_rate": 5.333333333333333e-07, + "loss": 0.1212, + "reward": -2.3611111640930176, + "reward_std": 0.4495189040899277, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.5833333283662796, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 1908.2777709960938, + "epoch": 0.01456726649528706, + 
"grad_norm": 0.13441088795661926, + "kl": 1.5820842236280441e-06, + "learning_rate": 5.666666666666666e-07, + "loss": 0.0448, + "reward": -2.25, + "reward_std": 0.15410767495632172, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.75, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 3585.5001220703125, + "epoch": 0.015424164524421594, + "grad_norm": 0.23104918003082275, + "kl": 3.343820571899414e-05, + "learning_rate": 6e-07, + "loss": 0.1208, + "reward": -2.6666667461395264, + "reward_std": 0.3314610570669174, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333432674408, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 2827.1943969726562, + "epoch": 0.016281062553556127, + "grad_norm": 0.24534867703914642, + "kl": 2.0720064640045166e-05, + "learning_rate": 6.333333333333332e-07, + "loss": 0.0672, + "reward": -2.555555582046509, + "reward_std": 0.245394766330719, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444477558136, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 4031.5556640625, + "epoch": 0.01713796058269066, + "grad_norm": 0.14566344022750854, + "kl": 3.6835670471191406e-05, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0251, + "reward": -2.8333332538604736, + "reward_std": 0.30821534991264343, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.1666666716337204, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 2811.638916015625, + "epoch": 0.017994858611825194, + "grad_norm": 0.2733408510684967, + "kl": 1.6957521438598633e-05, + "learning_rate": 7e-07, + "loss": 0.1159, + "reward": -2.3611111640930176, + "reward_std": 0.48556874692440033, + "rewards/confidence_reward": -2.888888955116272, + "rewards/format_reward": 0.5277777910232544, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 3465.8056640625, + "epoch": 0.018851756640959727, + "grad_norm": 0.197701096534729, + "kl": 
2.8759241104125977e-05, + "learning_rate": 7.333333333333332e-07, + "loss": 0.0588, + "reward": -2.694444417953491, + "reward_std": 0.2901904359459877, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.25000000186264515, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 3026.4722900390625, + "epoch": 0.01970865467009426, + "grad_norm": 0.3089195787906647, + "kl": 1.558661460876465e-05, + "learning_rate": 7.666666666666667e-07, + "loss": 0.2173, + "reward": -2.555555582046509, + "reward_std": 0.2221490517258644, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444552063942, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 3507.5555419921875, + "epoch": 0.02056555269922879, + "grad_norm": 0.40053591132164, + "kl": 5.6743621826171875e-05, + "learning_rate": 8e-07, + "loss": 0.1994, + "reward": -2.6944445371627808, + "reward_std": 0.3995024412870407, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3055555671453476, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 3597.361083984375, + "epoch": 0.021422450728363324, + "grad_norm": 0.3277747929096222, + "kl": 2.2545456886291504e-05, + "learning_rate": 8.333333333333333e-07, + "loss": 0.1546, + "reward": -2.583333373069763, + "reward_std": 0.49601033329963684, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4166666716337204, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 2460.694580078125, + "epoch": 0.022279348757497857, + "grad_norm": 0.24351875483989716, + "kl": 3.9458274841308594e-05, + "learning_rate": 8.666666666666667e-07, + "loss": 0.1574, + "reward": -2.305555582046509, + "reward_std": 0.22736985981464386, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.6944444477558136, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 3784.3056640625, + "epoch": 0.02313624678663239, + "grad_norm": 0.14974988996982574, + "kl": 
0.0001430511474609375, + "learning_rate": 9e-07, + "loss": 0.0993, + "reward": -2.7777777910232544, + "reward_std": 0.2453947588801384, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.22222222574055195, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 2524.4444580078125, + "epoch": 0.023993144815766924, + "grad_norm": 0.2757478356361389, + "kl": 0.00017023086547851562, + "learning_rate": 9.333333333333333e-07, + "loss": 0.0924, + "reward": -2.305555582046509, + "reward_std": 0.3995024487376213, + "rewards/confidence_reward": -2.888888955116272, + "rewards/format_reward": 0.5833333432674408, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 3612.0556640625, + "epoch": 0.024850042844901457, + "grad_norm": 0.31738027930259705, + "kl": 0.00018072128295898438, + "learning_rate": 9.666666666666666e-07, + "loss": 0.1543, + "reward": -2.6666667461395264, + "reward_std": 0.4803479313850403, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333358168602, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 3717.0555419921875, + "epoch": 0.02570694087403599, + "grad_norm": 0.223888099193573, + "kl": 0.00014829635620117188, + "learning_rate": 1e-06, + "loss": 0.1178, + "reward": -2.583333373069763, + "reward_std": 0.49601036310195923, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4166666716337204, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 2863.02783203125, + "epoch": 0.026563838903170524, + "grad_norm": 0.15052570402622223, + "kl": 0.0007700920104980469, + "learning_rate": 9.99969538601693e-07, + "loss": 0.1475, + "reward": -2.472222328186035, + "reward_std": 0.25061558187007904, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5277777761220932, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 2220.388916015625, + "epoch": 0.027420736932305057, + "grad_norm": 0.1104821264743805, + "kl": 0.0006427764892578125, + 
"learning_rate": 9.998781585307575e-07, + "loss": 0.0251, + "reward": -2.1944445371627808, + "reward_std": 0.2901904359459877, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.75, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 2779.8055419921875, + "epoch": 0.028277634961439587, + "grad_norm": 0.2474696934223175, + "kl": 0.00033664703369140625, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0417, + "reward": -2.2870370149612427, + "reward_std": 0.4910118281841278, + "rewards/confidence_reward": -2.8148149251937866, + "rewards/format_reward": 0.5277777910232544, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 3405.7777099609375, + "epoch": 0.02913453299057412, + "grad_norm": 0.11548831313848495, + "kl": 0.0003676414489746094, + "learning_rate": 9.99512700102336e-07, + "loss": 0.0451, + "reward": -2.7222222089767456, + "reward_std": 0.15932847559452057, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2777777835726738, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 3638.1666259765625, + "epoch": 0.029991431019708654, + "grad_norm": 0.22043903172016144, + "kl": 0.0002741813659667969, + "learning_rate": 9.992386712220707e-07, + "loss": 0.1144, + "reward": -2.6666667461395264, + "reward_std": 0.31865697354078293, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333432674408, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 3693.27783203125, + "epoch": 0.030848329048843187, + "grad_norm": 0.008012472651898861, + "kl": 0.0003859996795654297, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "reward": -2.833333373069763, + "reward_std": 0.0, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.1666666716337204, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 3796.9444580078125, + "epoch": 0.031705227077977724, + "grad_norm": 0.18162299692630768, + "kl": 0.001140594482421875, + 
"learning_rate": 9.985081996200277e-07, + "loss": 0.056, + "reward": -2.75, + "reward_std": 0.22736985236406326, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.25000000186264515, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 3224.5001220703125, + "epoch": 0.032562125107112254, + "grad_norm": 0.33491846919059753, + "kl": 0.00153350830078125, + "learning_rate": 9.98051855792412e-07, + "loss": 0.222, + "reward": -2.4722222089767456, + "reward_std": 0.4983728677034378, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5277777910232544, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 3594.0, + "epoch": 0.033419023136246784, + "grad_norm": 0.17942766845226288, + "kl": 0.0010013580322265625, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0885, + "reward": -2.6111111640930176, + "reward_std": 0.36798951029777527, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.3333333432674408, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 3537.7222900390625, + "epoch": 0.03427592116538132, + "grad_norm": 0.20143228769302368, + "kl": 0.001697540283203125, + "learning_rate": 9.969572609838744e-07, + "loss": 0.1538, + "reward": -2.555555582046509, + "reward_std": 0.3814775347709656, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444626569748, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 3175.02783203125, + "epoch": 0.03513281919451585, + "grad_norm": 0.16529610753059387, + "kl": 0.000911712646484375, + "learning_rate": 9.963191581935677e-07, + "loss": 0.1, + "reward": -2.555555582046509, + "reward_std": 0.15932848304510117, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444477558136, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 3616.7777099609375, + "epoch": 0.03598971722365039, + "grad_norm": 0.08064991980791092, + "kl": 0.001148223876953125, + "learning_rate": 
9.956206309337066e-07, + "loss": 0.0023, + "reward": -2.7777777910232544, + "reward_std": 0.08606629818677902, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2222222276031971, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 3938.3055419921875, + "epoch": 0.03684661525278492, + "grad_norm": 0.12845046818256378, + "kl": 0.00202178955078125, + "learning_rate": 9.948617737737001e-07, + "loss": 0.0375, + "reward": -2.805555582046509, + "reward_std": 0.1773533970117569, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.1944444477558136, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 3115.9722900390625, + "epoch": 0.037703513281919454, + "grad_norm": 0.1982233226299286, + "kl": 0.0035247802734375, + "learning_rate": 9.940426894506606e-07, + "loss": 0.1154, + "reward": -2.3240740299224854, + "reward_std": 0.49022431671619415, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.5833333432674408, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 3329.9444580078125, + "epoch": 0.038560411311053984, + "grad_norm": 0.2199009209871292, + "kl": 0.00379180908203125, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0548, + "reward": -2.6388888359069824, + "reward_std": 0.36345263570547104, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.30555556900799274, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 3532.666748046875, + "epoch": 0.03941730934018852, + "grad_norm": 0.1083880066871643, + "kl": 0.0015277862548828125, + "learning_rate": 9.922242910178859e-07, + "loss": 0.0935, + "reward": -2.638888955116272, + "reward_std": 0.29313045740127563, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.3055555671453476, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 2992.1944580078125, + "epoch": 0.04027420736932305, + "grad_norm": 1.405205488204956, + "kl": 0.0645751953125, 
+ "learning_rate": 9.912252230901906e-07, + "loss": 0.0511, + "reward": -2.5, + "reward_std": 0.3582318127155304, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.4444444477558136, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 3138.666748046875, + "epoch": 0.04113110539845758, + "grad_norm": 0.19926148653030396, + "kl": 0.02202606201171875, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0487, + "reward": -2.5, + "reward_std": 0.13608276098966599, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 2550.6944580078125, + "epoch": 0.04198800342759212, + "grad_norm": 0.2365693897008896, + "kl": 0.0066070556640625, + "learning_rate": 9.890480260828965e-07, + "loss": 0.1002, + "reward": -2.138888955116272, + "reward_std": 0.49953536689281464, + "rewards/confidence_reward": -2.8888888359069824, + "rewards/format_reward": 0.75, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 2925.8612060546875, + "epoch": 0.04284490145672665, + "grad_norm": 0.2880806028842926, + "kl": 0.0074005126953125, + "learning_rate": 9.878701917609207e-07, + "loss": 0.2009, + "reward": -2.4166667461395264, + "reward_std": 0.49203380942344666, + "rewards/confidence_reward": -2.833333373069763, + "rewards/format_reward": 0.4166666716337204, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 3871.27783203125, + "epoch": 0.043701799485861184, + "grad_norm": 0.1224374920129776, + "kl": 0.00347900390625, + "learning_rate": 9.866330768241983e-07, + "loss": 0.067, + "reward": -2.75, + "reward_std": 0.2634196802973747, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2500000037252903, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 3002.5833740234375, + "epoch": 0.044558697514995714, + "grad_norm": 0.3084653913974762, + "kl": 0.010650634765625, + "learning_rate": 9.853368487582886e-07, + "loss": 0.1609, 
+ "reward": -2.472222328186035, + "reward_std": 0.22736985981464386, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5277777761220932, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 3275.4444580078125, + "epoch": 0.04541559554413025, + "grad_norm": 0.1611953228712082, + "kl": 0.00553131103515625, + "learning_rate": 9.839816830517225e-07, + "loss": 0.0874, + "reward": -2.5277777910232544, + "reward_std": 0.34948598593473434, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4722222238779068, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 3493.4444580078125, + "epoch": 0.04627249357326478, + "grad_norm": 0.05148787423968315, + "kl": 0.00240325927734375, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0171, + "reward": -2.4166667461395264, + "reward_std": 0.09128709882497787, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5833333283662796, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 2541.5, + "epoch": 0.04712939160239932, + "grad_norm": 0.2660021483898163, + "kl": 0.0017642974853515625, + "learning_rate": 9.8109528054197e-07, + "loss": 0.0367, + "reward": -2.2685184478759766, + "reward_std": 0.5399755388498306, + "rewards/confidence_reward": -2.7962963581085205, + "rewards/format_reward": 0.5277777761220932, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 2758.02783203125, + "epoch": 0.04798628963153385, + "grad_norm": 0.1080583855509758, + "kl": 0.002521514892578125, + "learning_rate": 9.795644345114794e-07, + "loss": 0.0841, + "reward": -2.277777910232544, + "reward_std": 0.2221490517258644, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.7222222089767456, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 3181.0001220703125, + "epoch": 0.04884318766066838, + "grad_norm": 0.3695676922798157, + "kl": 0.0043849945068359375, + "learning_rate": 9.779754323328192e-07, + "loss": 0.2031, + "reward": 
-2.4166667461395264, + "reward_std": 0.34020692110061646, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5833333283662796, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 3258.4722900390625, + "epoch": 0.049700085689802914, + "grad_norm": 0.45151278376579285, + "kl": 0.002864837646484375, + "learning_rate": 9.76328489131448e-07, + "loss": 0.2348, + "reward": -2.5833332538604736, + "reward_std": 0.3995024561882019, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.416666679084301, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 3137.638916015625, + "epoch": 0.050556983718937444, + "grad_norm": 0.24236924946308136, + "kl": 0.007476806640625, + "learning_rate": 9.746238278771125e-07, + "loss": 0.0592, + "reward": -2.3888888359069824, + "reward_std": 0.245394766330719, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.6111111044883728, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 2517.861083984375, + "epoch": 0.05141388174807198, + "grad_norm": 0.17099355161190033, + "kl": 0.00344085693359375, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0848, + "reward": -2.1944445371627808, + "reward_std": 0.36345260590314865, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.75, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 3005.75, + "epoch": 0.05227077977720651, + "grad_norm": 0.17847158014774323, + "kl": 0.0050048828125, + "learning_rate": 9.71042282127789e-07, + "loss": 0.0104, + "reward": -2.379629611968994, + "reward_std": 0.2948459982872009, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.5277777761220932, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 2963.4722900390625, + "epoch": 0.05312767780634105, + "grad_norm": 0.22850683331489563, + "kl": 0.00327301025390625, + "learning_rate": 9.69165882516764e-07, + "loss": -0.0198, + "reward": -2.6111111640930176, + 
"reward_std": 0.1360827535390854, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.3333333432674408, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 2869.3056640625, + "epoch": 0.05398457583547558, + "grad_norm": 0.1911962777376175, + "kl": 0.001628875732421875, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0823, + "reward": -2.1574073433876038, + "reward_std": 0.4669785648584366, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.75, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 3161.9444580078125, + "epoch": 0.054841473864610114, + "grad_norm": 0.349700391292572, + "kl": 0.00498199462890625, + "learning_rate": 9.65243099959949e-07, + "loss": 0.2121, + "reward": -2.5, + "reward_std": 0.4047232121229172, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5000000149011612, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 2009.166748046875, + "epoch": 0.055698371893744644, + "grad_norm": 0.03013899177312851, + "kl": 0.00531768798828125, + "learning_rate": 9.631972480961233e-07, + "loss": 0.0002, + "reward": -2.3333332538604736, + "reward_std": 0.0, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.6666666865348816, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 3121.138916015625, + "epoch": 0.056555269922879174, + "grad_norm": 0.1924014538526535, + "kl": 0.003353118896484375, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0912, + "reward": -2.333333373069763, + "reward_std": 0.5140352547168732, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.6111111044883728, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 2877.4722900390625, + "epoch": 0.05741216795201371, + "grad_norm": 0.34053537249565125, + "kl": 0.005859375, + "learning_rate": 9.589380080381038e-07, + "loss": -0.0137, + "reward": -2.3425925970077515, + "reward_std": 0.5270616561174393, 
+ "rewards/confidence_reward": -2.7037038803100586, + "rewards/format_reward": 0.361111119389534, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 3536.3333740234375, + "epoch": 0.05826906598114824, + "grad_norm": 0.24978837370872498, + "kl": 0.00376129150390625, + "learning_rate": 9.567251964768342e-07, + "loss": 0.1593, + "reward": -2.555555582046509, + "reward_std": 0.4675438404083252, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444477558136, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 3617.8612060546875, + "epoch": 0.05912596401028278, + "grad_norm": 0.23766648769378662, + "kl": 0.00360870361328125, + "learning_rate": 9.54457320834625e-07, + "loss": 0.1549, + "reward": -2.722222328186035, + "reward_std": 0.3814775347709656, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2777777910232544, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 3170.4444580078125, + "epoch": 0.05998286203941731, + "grad_norm": 0.11585590243339539, + "kl": 0.0055084228515625, + "learning_rate": 9.521346881455354e-07, + "loss": 0.0739, + "reward": -2.5, + "reward_std": 0.2221490666270256, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.4444444626569748, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 3495.6666259765625, + "epoch": 0.060839760068551844, + "grad_norm": 0.25010788440704346, + "kl": 0.0036468505859375, + "learning_rate": 9.497576128568518e-07, + "loss": 0.0395, + "reward": -2.7041274309158325, + "reward_std": 0.316488653421402, + "rewards/confidence_reward": -2.8985718488693237, + "rewards/format_reward": 0.1944444477558136, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 2589.1111450195312, + "epoch": 0.061696658097686374, + "grad_norm": 0.08638693392276764, + "kl": 0.0056915283203125, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0357, + "reward": -2.4444445371627808, + "reward_std": 
0.13608276844024658, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.5000000149011612, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 3842.9444580078125, + "epoch": 0.06255355612682091, + "grad_norm": 0.2187531441450119, + "kl": 0.003448486328125, + "learning_rate": 9.448414290795618e-07, + "loss": 0.1007, + "reward": -2.833333373069763, + "reward_std": 0.30821535736322403, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.16666667349636555, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 3221.8055419921875, + "epoch": 0.06341045415595545, + "grad_norm": 0.21624591946601868, + "kl": 0.00295257568359375, + "learning_rate": 9.42302986163543e-07, + "loss": 0.073, + "reward": -2.4444445371627808, + "reward_std": 0.3082153648138046, + "rewards/confidence_reward": -2.7777777910232544, + "rewards/format_reward": 0.3333333432674408, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 3737.4722900390625, + "epoch": 0.06426735218508997, + "grad_norm": 0.18189558386802673, + "kl": 0.007110595703125, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0809, + "reward": -2.6944445371627808, + "reward_std": 0.3134361654520035, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.305555559694767, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 3442.9722900390625, + "epoch": 0.06512425021422451, + "grad_norm": 0.11082806438207626, + "kl": 0.005889892578125, + "learning_rate": 9.370671165529144e-07, + "loss": 0.0364, + "reward": -2.5277777910232544, + "reward_std": 0.0680413767695427, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.472222238779068, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 3204.166748046875, + "epoch": 0.06598114824335904, + "grad_norm": 0.1811019480228424, + "kl": 0.007965087890625, + "learning_rate": 9.343703987112365e-07, + "loss": 0.1172, + "reward": -2.5277777910232544, + "reward_std": 
0.37919675558805466, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.4166666865348816, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 3627.47216796875, + "epoch": 0.06683804627249357, + "grad_norm": 0.15231575071811676, + "kl": 0.00232696533203125, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0252, + "reward": -2.694444417953491, + "reward_std": 0.24017397314310074, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3055555559694767, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 2848.416748046875, + "epoch": 0.0676949443016281, + "grad_norm": 0.2135927677154541, + "kl": 0.0075836181640625, + "learning_rate": 9.288212223678658e-07, + "loss": 0.0625, + "reward": -2.407407283782959, + "reward_std": 0.44895367324352264, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.5, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 3405.77783203125, + "epoch": 0.06855184233076264, + "grad_norm": 0.12736141681671143, + "kl": 0.00734710693359375, + "learning_rate": 9.259695151358214e-07, + "loss": 0.0282, + "reward": -2.638888955116272, + "reward_std": 0.20412413775920868, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.361111119389534, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 3254.0555419921875, + "epoch": 0.06940874035989718, + "grad_norm": 0.20807288587093353, + "kl": 0.008697509765625, + "learning_rate": 9.230669076497687e-07, + "loss": 0.1148, + "reward": -2.638888955116272, + "reward_std": 0.20412413775920868, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.3055555522441864, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 3111.166748046875, + "epoch": 0.0702656383890317, + "grad_norm": 0.2130519151687622, + "kl": 0.006744384765625, + "learning_rate": 9.20113792876298e-07, + "loss": 0.1215, + "reward": -2.5, + "reward_std": 0.18257419764995575, + 
"rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5000000149011612, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 2739.3333740234375, + "epoch": 0.07112253641816624, + "grad_norm": 0.2380674034357071, + "kl": 0.007965087890625, + "learning_rate": 9.171105706198774e-07, + "loss": 0.0922, + "reward": -2.2222222685813904, + "reward_std": 0.31865695118904114, + "rewards/confidence_reward": -2.833333373069763, + "rewards/format_reward": 0.6111111342906952, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 2850.4166259765625, + "epoch": 0.07197943444730077, + "grad_norm": 0.19383125007152557, + "kl": 0.00946044921875, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0087, + "reward": -1.9166666865348816, + "reward_std": 0.6418167352676392, + "rewards/confidence_reward": -2.4444446563720703, + "rewards/format_reward": 0.5277777910232544, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 2711.52783203125, + "epoch": 0.0728363324764353, + "grad_norm": 0.23535026609897614, + "kl": 0.004180908203125, + "learning_rate": 9.109554367397697e-07, + "loss": 0.1209, + "reward": -2.333333373069763, + "reward_std": 0.3412187397480011, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.6111111044883728, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 3125.9722900390625, + "epoch": 0.07369323050556983, + "grad_norm": 0.07005209475755692, + "kl": 0.010040283203125, + "learning_rate": 9.078043584226815e-07, + "loss": 0.0131, + "reward": -2.583333373069763, + "reward_std": 0.09128709882497787, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4166666567325592, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 3320.0555419921875, + "epoch": 0.07455012853470437, + "grad_norm": 0.1450621485710144, + "kl": 0.010986328125, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0719, + "reward": -2.5277777910232544, + "reward_std": 
0.20412413030862808, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4722222238779068, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 3126.52783203125, + "epoch": 0.07540702656383891, + "grad_norm": 0.14595212042331696, + "kl": 0.004852294921875, + "learning_rate": 9.013573120044966e-07, + "loss": 0.0592, + "reward": -2.3518518209457397, + "reward_std": 0.3861331045627594, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.5555555671453476, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 3651.5833740234375, + "epoch": 0.07626392459297343, + "grad_norm": 0.21023660898208618, + "kl": 0.006317138671875, + "learning_rate": 8.980622167302837e-07, + "loss": 0.0916, + "reward": -2.5740740299224854, + "reward_std": 0.5071772634983063, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.3333333358168602, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 3397.9722900390625, + "epoch": 0.07712082262210797, + "grad_norm": 0.188828244805336, + "kl": 0.0087890625, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0082, + "reward": -2.2870370149612427, + "reward_std": 0.40421056002378464, + "rewards/confidence_reward": -2.7037036418914795, + "rewards/format_reward": 0.4166666716337204, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 2552.52783203125, + "epoch": 0.0779777206512425, + "grad_norm": 0.40647614002227783, + "kl": 0.01544189453125, + "learning_rate": 8.91331112506991e-07, + "loss": 0.1232, + "reward": -1.8888888955116272, + "reward_std": 0.5773965641856194, + "rewards/confidence_reward": -2.444444477558136, + "rewards/format_reward": 0.5555555522441864, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 2823.6944580078125, + "epoch": 0.07883461868037704, + "grad_norm": 0.12339317798614502, + "kl": 0.0080413818359375, + "learning_rate": 8.878960148416747e-07, + "loss": 0.0168, + "reward": -2.3611111640930176, 
+ "reward_std": 0.22736985236406326, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.638888880610466, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 3384.52783203125, + "epoch": 0.07969151670951156, + "grad_norm": 0.29699084162712097, + "kl": 0.00677490234375, + "learning_rate": 8.844151714648274e-07, + "loss": 0.1672, + "reward": -2.5, + "reward_std": 0.3814775347709656, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5000000149011612, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 3287.25, + "epoch": 0.0805484147386461, + "grad_norm": 0.39855116605758667, + "kl": 0.0213623046875, + "learning_rate": 8.808890536269229e-07, + "loss": 0.1707, + "reward": -2.416666626930237, + "reward_std": 0.3995024412870407, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5833333432674408, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 2682.75, + "epoch": 0.08140531276778064, + "grad_norm": 0.391999751329422, + "kl": 0.015533447265625, + "learning_rate": 8.773181387078719e-07, + "loss": 0.1891, + "reward": -2.3611111640930176, + "reward_std": 0.49601034820079803, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.5833333432674408, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 3296.9722900390625, + "epoch": 0.08226221079691516, + "grad_norm": 0.22848033905029297, + "kl": 0.0120391845703125, + "learning_rate": 8.737029101523929e-07, + "loss": 0.1268, + "reward": -2.3240740299224854, + "reward_std": 0.5216506272554398, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.5833333432674408, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 2389.4166870117188, + "epoch": 0.0831191088260497, + "grad_norm": 0.2833496034145355, + "kl": 0.0255126953125, + "learning_rate": 8.700438574045617e-07, + "loss": 0.239, + "reward": -2.0092591643333435, + "reward_std": 0.788471519947052, + 
"rewards/confidence_reward": -2.759259343147278, + "rewards/format_reward": 0.75, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 2865.9722900390625, + "epoch": 0.08397600685518423, + "grad_norm": 0.18639419972896576, + "kl": 0.014617919921875, + "learning_rate": 8.663414758415478e-07, + "loss": 0.0585, + "reward": -2.333333373069763, + "reward_std": 0.3314610570669174, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.6666666567325592, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 2753.9444580078125, + "epoch": 0.08483290488431877, + "grad_norm": 0.26271870732307434, + "kl": 0.021240234375, + "learning_rate": 8.625962667065487e-07, + "loss": 0.1419, + "reward": -2.25, + "reward_std": 0.41698990762233734, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.6944444477558136, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 3196.52783203125, + "epoch": 0.0856898029134533, + "grad_norm": 0.2569756805896759, + "kl": 0.027587890625, + "learning_rate": 8.588087370409302e-07, + "loss": 0.0869, + "reward": -2.2129629254341125, + "reward_std": 0.642838716506958, + "rewards/confidence_reward": -2.574074149131775, + "rewards/format_reward": 0.361111119389534, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 3108.25, + "epoch": 0.08654670094258783, + "grad_norm": 0.221235990524292, + "kl": 0.0069427490234375, + "learning_rate": 8.549793996155795e-07, + "loss": 0.1286, + "reward": -2.3518518209457397, + "reward_std": 0.5122157335281372, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.5555555820465088, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 3036.388916015625, + "epoch": 0.08740359897172237, + "grad_norm": 0.19860585033893585, + "kl": 0.01312255859375, + "learning_rate": 8.511087728614862e-07, + "loss": 0.1027, + "reward": -2.472222328186035, + "reward_std": 0.4092601239681244, + 
"rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.4722222238779068, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 3937.0833740234375, + "epoch": 0.08826049700085689, + "grad_norm": 0.18265724182128906, + "kl": 0.0120849609375, + "learning_rate": 8.471973807995534e-07, + "loss": 0.0519, + "reward": -2.805555582046509, + "reward_std": 0.3134361505508423, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.19444444961845875, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 2821.6944580078125, + "epoch": 0.08911739502999143, + "grad_norm": 0.1616680771112442, + "kl": 0.013641357421875, + "learning_rate": 8.432457529696548e-07, + "loss": 0.0466, + "reward": -2.240740656852722, + "reward_std": 0.409378819167614, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.6666666567325592, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 3072.27783203125, + "epoch": 0.08997429305912596, + "grad_norm": 0.27748724818229675, + "kl": 0.008331298828125, + "learning_rate": 8.392544243589427e-07, + "loss": 0.1194, + "reward": -2.305555582046509, + "reward_std": 0.6821095794439316, + "rewards/confidence_reward": -2.833333373069763, + "rewards/format_reward": 0.5277777910232544, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 3736.3333740234375, + "epoch": 0.0908311910882605, + "grad_norm": 0.19383865594863892, + "kl": 0.014678955078125, + "learning_rate": 8.352239353294194e-07, + "loss": 0.0574, + "reward": -2.5, + "reward_std": 0.37219424545764923, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.4444444477558136, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 3856.583251953125, + "epoch": 0.09168808911739502, + "grad_norm": 0.17508208751678467, + "kl": 0.0128173828125, + "learning_rate": 8.31154831544782e-07, + "loss": 0.0391, + "reward": -2.6944445371627808, + "reward_std": 
0.24017397314310074, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.305555559694767, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.5555419921875, + "epoch": 0.09254498714652956, + "grad_norm": 0.15831252932548523, + "kl": 0.017791748046875, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0035, + "reward": -2.518518567085266, + "reward_std": 0.47219936549663544, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.3888889029622078, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 3814.5555419921875, + "epoch": 0.0934018851756641, + "grad_norm": 0.2039646953344345, + "kl": 0.012939453125, + "learning_rate": 8.229029884294662e-07, + "loss": 0.0669, + "reward": -2.666666626930237, + "reward_std": 0.3547067865729332, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333358168602, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 3702.0, + "epoch": 0.09425878320479864, + "grad_norm": 0.19251351058483124, + "kl": 0.00970458984375, + "learning_rate": 8.187213662662538e-07, + "loss": 0.0819, + "reward": -2.5833332538604736, + "reward_std": 0.3995024412870407, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.416666679084301, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 3664.5833740234375, + "epoch": 0.09511568123393316, + "grad_norm": 0.10979479551315308, + "kl": 0.013031005859375, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0533, + "reward": -2.75, + "reward_std": 0.15410767495632172, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.25, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 3430.5556640625, + "epoch": 0.0959725792630677, + "grad_norm": 0.21925541758537292, + "kl": 0.012115478515625, + "learning_rate": 8.102495512755938e-07, + "loss": 0.0942, + "reward": -2.5833332538604736, + "reward_std": 0.2738612964749336, + "rewards/confidence_reward": -3.0, + 
"rewards/format_reward": 0.416666679084301, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 3218.166748046875, + "epoch": 0.09682947729220223, + "grad_norm": 0.2610246241092682, + "kl": 0.014556884765625, + "learning_rate": 8.059605053962833e-07, + "loss": 0.1001, + "reward": -2.435185194015503, + "reward_std": 0.39413511008024216, + "rewards/confidence_reward": -2.8518519401550293, + "rewards/format_reward": 0.4166666865348816, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 3630.8612060546875, + "epoch": 0.09768637532133675, + "grad_norm": 0.14595678448677063, + "kl": 0.011627197265625, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0598, + "reward": -2.555555582046509, + "reward_std": 0.1360827535390854, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444477558136, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 3384.3055419921875, + "epoch": 0.09854327335047129, + "grad_norm": 0.1648566722869873, + "kl": 0.02392578125, + "learning_rate": 7.972790401318627e-07, + "loss": 0.103, + "reward": -2.6388888359069824, + "reward_std": 0.2901904359459877, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.30555556900799274, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 3751.388916015625, + "epoch": 0.09940017137960583, + "grad_norm": 0.21332553029060364, + "kl": 0.02044677734375, + "learning_rate": 7.928877960781808e-07, + "loss": 0.1117, + "reward": -2.75, + "reward_std": 0.3134361356496811, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2500000037252903, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 3072.2501220703125, + "epoch": 0.10025706940874037, + "grad_norm": 0.1085459515452385, + "kl": 0.02020263671875, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0629, + "reward": -2.2685184478759766, + "reward_std": 0.2494850754737854, + "rewards/confidence_reward": -2.9074074029922485, + 
"rewards/format_reward": 0.6388888955116272, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 3123.9722900390625, + "epoch": 0.10111396743787489, + "grad_norm": 0.16286125779151917, + "kl": 0.0162353515625, + "learning_rate": 7.840072575681468e-07, + "loss": 0.0787, + "reward": -2.4452918767929077, + "reward_std": 0.4294184446334839, + "rewards/confidence_reward": -2.889736294746399, + "rewards/format_reward": 0.4444444626569748, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 3960.3333740234375, + "epoch": 0.10197086546700942, + "grad_norm": 0.15160124003887177, + "kl": 0.01416015625, + "learning_rate": 7.795191653945538e-07, + "loss": 0.0512, + "reward": -2.805555582046509, + "reward_std": 0.32624027132987976, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.1944444477558136, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 3328.9722900390625, + "epoch": 0.10282776349614396, + "grad_norm": 0.1655929535627365, + "kl": 0.01312255859375, + "learning_rate": 7.75e-07, + "loss": 0.0925, + "reward": -2.4722222089767456, + "reward_std": 0.3134361580014229, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5277777910232544, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 3596.25, + "epoch": 0.1036846615252785, + "grad_norm": 0.08225465565919876, + "kl": 0.011199951171875, + "learning_rate": 7.704503732071391e-07, + "loss": 0.0074, + "reward": -2.6111111640930176, + "reward_std": 0.08606629818677902, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3888888955116272, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 3386.77783203125, + "epoch": 0.10454155955441302, + "grad_norm": 0.14785455167293549, + "kl": 0.02099609375, + "learning_rate": 7.658709009626109e-07, + "loss": 0.0373, + "reward": -2.6944445371627808, + "reward_std": 0.24017397314310074, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3055555671453476, + 
"step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 3158.27783203125, + "epoch": 0.10539845758354756, + "grad_norm": 0.22167643904685974, + "kl": 0.02313232421875, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0777, + "reward": -2.444444417953491, + "reward_std": 0.3582318276166916, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.5000000149011612, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 2636.861083984375, + "epoch": 0.1062553556126821, + "grad_norm": 0.2819192707538605, + "kl": 0.01617431640625, + "learning_rate": 7.566249040241553e-07, + "loss": -0.0007, + "reward": -1.9629629850387573, + "reward_std": 0.656702309846878, + "rewards/confidence_reward": -2.574074149131775, + "rewards/format_reward": 0.6111111044883728, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 3987.638916015625, + "epoch": 0.10711225364181662, + "grad_norm": 0.15799356997013092, + "kl": 0.0233154296875, + "learning_rate": 7.51959631090208e-07, + "loss": 0.0367, + "reward": -2.833333373069763, + "reward_std": 0.245394766330719, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.1666666716337204, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 3636.0001220703125, + "epoch": 0.10796915167095116, + "grad_norm": 0.20148231089115143, + "kl": 0.012664794921875, + "learning_rate": 7.472670160550848e-07, + "loss": 0.1046, + "reward": -2.555555582046509, + "reward_std": 0.3814775347709656, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444477558136, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 3470.388916015625, + "epoch": 0.10882604970008569, + "grad_norm": 0.29233935475349426, + "kl": 0.018310546875, + "learning_rate": 7.425476942237444e-07, + "loss": 0.0973, + "reward": -2.5407216548919678, + "reward_std": 0.5416866987943649, + "rewards/confidence_reward": -2.8462772369384766, + "rewards/format_reward": 0.305555559694767, + 
"step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 3344.111083984375, + "epoch": 0.10968294772922023, + "grad_norm": 0.12020324170589447, + "kl": 0.02020263671875, + "learning_rate": 7.37802304516818e-07, + "loss": 0.025, + "reward": -2.583333373069763, + "reward_std": 0.15410767495632172, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4166666716337204, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 3881.1944580078125, + "epoch": 0.11053984575835475, + "grad_norm": 0.17918746173381805, + "kl": 0.01629638671875, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0508, + "reward": -2.6944445371627808, + "reward_std": 0.3995024263858795, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.305555559694767, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 2966.388916015625, + "epoch": 0.11139674378748929, + "grad_norm": 0.2491733282804489, + "kl": 0.02984619140625, + "learning_rate": 7.282358947176205e-07, + "loss": 0.075, + "reward": -2.3425925970077515, + "reward_std": 0.514257550239563, + "rewards/confidence_reward": -2.814814805984497, + "rewards/format_reward": 0.472222238779068, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 3056.888916015625, + "epoch": 0.11225364181662383, + "grad_norm": 0.1091020330786705, + "kl": 0.018524169921875, + "learning_rate": 7.234161697641017e-07, + "loss": 0.0298, + "reward": -2.5008474588394165, + "reward_std": 0.35615625977516174, + "rewards/confidence_reward": -2.889736294746399, + "rewards/format_reward": 0.3888888955116272, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 3280.1944580078125, + "epoch": 0.11311053984575835, + "grad_norm": 0.22998450696468353, + "kl": 0.016815185546875, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0906, + "reward": -2.555555582046509, + "reward_std": 0.3082153648138046, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 
0.3888888955116272, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 3573.75, + "epoch": 0.11396743787489289, + "grad_norm": 0.17413660883903503, + "kl": 0.018310546875, + "learning_rate": 7.137069422289181e-07, + "loss": 0.089, + "reward": -2.638888955116272, + "reward_std": 0.3134361431002617, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.361111119389534, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 3063.0555419921875, + "epoch": 0.11482433590402742, + "grad_norm": 0.214571014046669, + "kl": 0.03424072265625, + "learning_rate": 7.08818754121241e-07, + "loss": 0.039, + "reward": -2.2314815521240234, + "reward_std": 0.4910118132829666, + "rewards/confidence_reward": -2.7592592239379883, + "rewards/format_reward": 0.5277777910232544, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 3406.6112060546875, + "epoch": 0.11568123393316196, + "grad_norm": 0.20651200413703918, + "kl": 0.026092529296875, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0338, + "reward": -2.296296238899231, + "reward_std": 0.5582656562328339, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.6111111044883728, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 3469.6944580078125, + "epoch": 0.11653813196229648, + "grad_norm": 0.12644833326339722, + "kl": 0.01922607421875, + "learning_rate": 6.989785380482312e-07, + "loss": 0.0342, + "reward": -2.6944445371627808, + "reward_std": 0.1541076824069023, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.305555559694767, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 3475.4166259765625, + "epoch": 0.11739502999143102, + "grad_norm": 0.2094379961490631, + "kl": 0.02838134765625, + "learning_rate": 6.940278422906372e-07, + "loss": 0.1098, + "reward": -2.5, + "reward_std": 0.5836373865604401, + "rewards/confidence_reward": -2.888888955116272, + "rewards/format_reward": 0.3888888955116272, + 
"step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 3328.0, + "epoch": 0.11825192802056556, + "grad_norm": 0.26488515734672546, + "kl": 0.019317626953125, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0928, + "reward": -2.260106682777405, + "reward_std": 0.5154568552970886, + "rewards/confidence_reward": -2.704551100730896, + "rewards/format_reward": 0.4444444626569748, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 2500.5833740234375, + "epoch": 0.11910882604970009, + "grad_norm": 0.22615168988704681, + "kl": 0.0201416015625, + "learning_rate": 6.840686264673168e-07, + "loss": 0.0361, + "reward": -2.000847280025482, + "reward_std": 0.6050563156604767, + "rewards/confidence_reward": -2.5564029216766357, + "rewards/format_reward": 0.5555555671453476, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 3513.1112060546875, + "epoch": 0.11996572407883462, + "grad_norm": 0.3215450644493103, + "kl": 0.02825927734375, + "learning_rate": 6.790614547199906e-07, + "loss": 0.1226, + "reward": -2.555555582046509, + "reward_std": 0.41752736270427704, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.4444444552063942, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 3207.638916015625, + "epoch": 0.12082262210796915, + "grad_norm": 0.2798071503639221, + "kl": 0.02069091796875, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0961, + "reward": -2.2870370149612427, + "reward_std": 0.6775233149528503, + "rewards/confidence_reward": -2.6481481790542603, + "rewards/format_reward": 0.3611111119389534, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 3339.1666259765625, + "epoch": 0.12167952013710369, + "grad_norm": 0.26966410875320435, + "kl": 0.02276611328125, + "learning_rate": 6.68995372916741e-07, + "loss": 0.0981, + "reward": -2.3518519401550293, + "reward_std": 0.44895367324352264, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 
0.5555555522441864, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 3242.3055419921875, + "epoch": 0.12253641816623821, + "grad_norm": 0.2671264111995697, + "kl": 0.02325439453125, + "learning_rate": 6.639378256471608e-07, + "loss": 0.0918, + "reward": -2.3611111640930176, + "reward_std": 0.4583979994058609, + "rewards/confidence_reward": -2.722222328186035, + "rewards/format_reward": 0.361111119389534, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 2081.5, + "epoch": 0.12339331619537275, + "grad_norm": 0.25654128193855286, + "kl": 0.02740478515625, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0977, + "reward": -1.8148147463798523, + "reward_std": 0.7522204518318176, + "rewards/confidence_reward": -2.64814829826355, + "rewards/format_reward": 0.8333333432674408, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 3736.9166259765625, + "epoch": 0.12425021422450729, + "grad_norm": 0.12350251525640488, + "kl": 0.02294921875, + "learning_rate": 6.537771418340981e-07, + "loss": 0.0331, + "reward": -2.7222222089767456, + "reward_std": 0.2221490517258644, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.27777777798473835, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 2392.7222900390625, + "epoch": 0.12510711225364182, + "grad_norm": 0.5596352815628052, + "kl": 0.03955078125, + "learning_rate": 6.486753808845564e-07, + "loss": 0.2211, + "reward": -1.8425925374031067, + "reward_std": 0.754194438457489, + "rewards/confidence_reward": -2.6481481790542603, + "rewards/format_reward": 0.8055555820465088, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 2694.5001220703125, + "epoch": 0.12596401028277635, + "grad_norm": 0.0685691088438034, + "kl": 0.02557373046875, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0014, + "reward": -2.305555582046509, + "reward_std": 0.0680413767695427, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 
0.6944444477558136, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 2063.6111450195312, + "epoch": 0.1268209083119109, + "grad_norm": 0.34025242924690247, + "kl": 0.03564453125, + "learning_rate": 6.384324742897735e-07, + "loss": 0.1248, + "reward": -2.0092591047286987, + "reward_std": 0.7437996864318848, + "rewards/confidence_reward": -2.759259343147278, + "rewards/format_reward": 0.75, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 3310.27783203125, + "epoch": 0.12767780634104542, + "grad_norm": 0.17515520751476288, + "kl": 0.02813720703125, + "learning_rate": 6.332927153701215e-07, + "loss": 0.105, + "reward": -2.3518518805503845, + "reward_std": 0.44590386003255844, + "rewards/confidence_reward": -2.8518519401550293, + "rewards/format_reward": 0.5000000074505806, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 3576.0833740234375, + "epoch": 0.12853470437017994, + "grad_norm": 0.540116012096405, + "kl": 0.0565185546875, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0537, + "reward": -2.638888955116272, + "reward_std": 0.3134361505508423, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3611111119389534, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 2819.3056640625, + "epoch": 0.1293916023993145, + "grad_norm": 0.3199455440044403, + "kl": 0.032958984375, + "learning_rate": 6.229800653975054e-07, + "loss": 0.1061, + "reward": -2.2314813137054443, + "reward_std": 0.6937831938266754, + "rewards/confidence_reward": -2.814814805984497, + "rewards/format_reward": 0.5833333432674408, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 2036.6388854980469, + "epoch": 0.13024850042844902, + "grad_norm": 0.25735169649124146, + "kl": 0.02972412109375, + "learning_rate": 6.178085705122674e-07, + "loss": 0.023, + "reward": -1.5749215185642242, + "reward_std": 1.027258962392807, + "rewards/confidence_reward": -2.3526992201805115, + "rewards/format_reward": 
0.7777777910232544, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 3299.5833740234375, + "epoch": 0.13110539845758354, + "grad_norm": 0.14086535573005676, + "kl": 0.0323486328125, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0307, + "reward": -2.42592591047287, + "reward_std": 0.4229704663157463, + "rewards/confidence_reward": -2.8148149251937866, + "rewards/format_reward": 0.38888888247311115, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 3406.5001220703125, + "epoch": 0.1319622964867181, + "grad_norm": 0.258766233921051, + "kl": 0.03106689453125, + "learning_rate": 6.074387415372676e-07, + "loss": 0.1326, + "reward": -2.5000001192092896, + "reward_std": 0.30821535736322403, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5000000074505806, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 3476.388916015625, + "epoch": 0.1328191945158526, + "grad_norm": 0.20013348758220673, + "kl": 0.02679443359375, + "learning_rate": 6.022418113563535e-07, + "loss": 0.0258, + "reward": -2.472645878791809, + "reward_std": 0.39784130454063416, + "rewards/confidence_reward": -2.6948680877685547, + "rewards/format_reward": 0.22222222574055195, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 3603.3612060546875, + "epoch": 0.13367609254498714, + "grad_norm": 0.31663545966148376, + "kl": 0.027587890625, + "learning_rate": 5.97037808470444e-07, + "loss": 0.1284, + "reward": -2.4444445371627808, + "reward_std": 0.7150397002696991, + "rewards/confidence_reward": -2.888888955116272, + "rewards/format_reward": 0.4444444626569748, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 2918.666748046875, + "epoch": 0.13453299057412169, + "grad_norm": 0.2475612312555313, + "kl": 0.031494140625, + "learning_rate": 5.918274374182266e-07, + "loss": 0.0837, + "reward": -2.083333432674408, + "reward_std": 0.5944807976484299, + "rewards/confidence_reward": -2.666666626930237, + 
"rewards/format_reward": 0.5833333432674408, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 2515.8612060546875, + "epoch": 0.1353898886032562, + "grad_norm": 0.5117289423942566, + "kl": 0.0574951171875, + "learning_rate": 5.866114036005362e-07, + "loss": 0.2035, + "reward": -1.8888888955116272, + "reward_std": 0.8281392157077789, + "rewards/confidence_reward": -2.611111044883728, + "rewards/format_reward": 0.7222222089767456, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 3296.861083984375, + "epoch": 0.13624678663239073, + "grad_norm": 0.28699225187301636, + "kl": 0.02996826171875, + "learning_rate": 5.813904131848564e-07, + "loss": 0.1256, + "reward": -2.4629629850387573, + "reward_std": 0.4850034862756729, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.4444444626569748, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 3864.2777099609375, + "epoch": 0.13710368466152528, + "grad_norm": 0.21943415701389313, + "kl": 0.0389404296875, + "learning_rate": 5.761651730097142e-07, + "loss": 0.0667, + "reward": -2.6944445371627808, + "reward_std": 0.3995024561882019, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.305555559694767, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 3015.02783203125, + "epoch": 0.1379605826906598, + "grad_norm": 0.22274136543273926, + "kl": 0.03118896484375, + "learning_rate": 5.709363904889861e-07, + "loss": 0.0465, + "reward": -2.3518518209457397, + "reward_std": 0.47685495018959045, + "rewards/confidence_reward": -2.7962963581085205, + "rewards/format_reward": 0.4444444477558136, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 3587.5, + "epoch": 0.13881748071979436, + "grad_norm": 0.1837625652551651, + "kl": 0.042724609375, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0174, + "reward": -2.5000001192092896, + "reward_std": 0.42796894907951355, + "rewards/confidence_reward": 
-2.833333373069763, + "rewards/format_reward": 0.3333333432674408, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 3590.4444580078125, + "epoch": 0.13967437874892888, + "grad_norm": 0.41009289026260376, + "kl": 0.0537109375, + "learning_rate": 5.604710303683253e-07, + "loss": 0.1195, + "reward": -2.4259259700775146, + "reward_std": 0.5152682885527611, + "rewards/confidence_reward": -2.7592592239379883, + "rewards/format_reward": 0.3333333432674408, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 3338.5, + "epoch": 0.1405312767780634, + "grad_norm": 0.22997497022151947, + "kl": 0.034423828125, + "learning_rate": 5.552358696106288e-07, + "loss": 0.1118, + "reward": -2.4907407760620117, + "reward_std": 0.5448963046073914, + "rewards/confidence_reward": -2.7962963581085205, + "rewards/format_reward": 0.305555559694767, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 3163.6944580078125, + "epoch": 0.14138817480719795, + "grad_norm": 0.29789188504219055, + "kl": 0.0311279296875, + "learning_rate": 5.5e-07, + "loss": 0.0972, + "reward": -2.2870370149612427, + "reward_std": 0.5160937160253525, + "rewards/confidence_reward": -2.7592592239379883, + "rewards/format_reward": 0.4722222238779068, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 2695.3333740234375, + "epoch": 0.14224507283633248, + "grad_norm": 0.25649595260620117, + "kl": 0.02960205078125, + "learning_rate": 5.447641303893714e-07, + "loss": 0.0524, + "reward": -1.7499999403953552, + "reward_std": 0.6183541268110275, + "rewards/confidence_reward": -2.3333334922790527, + "rewards/format_reward": 0.5833333432674408, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 2688.6944580078125, + "epoch": 0.143101970865467, + "grad_norm": 0.23350609838962555, + "kl": 0.037353515625, + "learning_rate": 5.395289696316747e-07, + "loss": 0.0783, + "reward": -2.1582546830177307, + "reward_std": 0.7079941481351852, + 
"rewards/confidence_reward": -2.7415881156921387, + "rewards/format_reward": 0.5833333358168602, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 3119.22216796875, + "epoch": 0.14395886889460155, + "grad_norm": 0.3477858006954193, + "kl": 0.0482177734375, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0932, + "reward": -2.129629611968994, + "reward_std": 0.7057251632213593, + "rewards/confidence_reward": -2.7407407760620117, + "rewards/format_reward": 0.6111111044883728, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 2954.166748046875, + "epoch": 0.14481576692373607, + "grad_norm": 0.16720406711101532, + "kl": 0.03704833984375, + "learning_rate": 5.29063609511014e-07, + "loss": 0.0403, + "reward": -2.1481481194496155, + "reward_std": 0.20082136243581772, + "rewards/confidence_reward": -2.5925925970077515, + "rewards/format_reward": 0.4444444626569748, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 3575.3333740234375, + "epoch": 0.1456726649528706, + "grad_norm": 0.3258434534072876, + "kl": 0.0556640625, + "learning_rate": 5.238348269902859e-07, + "loss": 0.1092, + "reward": -2.305555522441864, + "reward_std": 0.49953543394804, + "rewards/confidence_reward": -2.7222222089767456, + "rewards/format_reward": 0.416666679084301, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 3646.138916015625, + "epoch": 0.14652956298200515, + "grad_norm": 0.22214293479919434, + "kl": 0.0496826171875, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0377, + "reward": -2.6111111640930176, + "reward_std": 0.222149059176445, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3888888955116272, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 2912.25, + "epoch": 0.14738646101113967, + "grad_norm": 0.24840864539146423, + "kl": 0.0555419921875, + "learning_rate": 5.133885963994639e-07, + "loss": 0.0503, + "reward": -1.9814814329147339, + "reward_std": 0.8633435666561127, + 
"rewards/confidence_reward": -2.481481671333313, + "rewards/format_reward": 0.5, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 3765.916748046875, + "epoch": 0.14824335904027422, + "grad_norm": 0.1615857183933258, + "kl": 0.05029296875, + "learning_rate": 5.081725625817735e-07, + "loss": 0.0285, + "reward": -2.6666667461395264, + "reward_std": 0.17213259637355804, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333432674408, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 4005.888916015625, + "epoch": 0.14910025706940874, + "grad_norm": 0.124207504093647, + "kl": 0.042724609375, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0302, + "reward": -2.8611111640930176, + "reward_std": 0.15410767495632172, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.1388888955116272, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 3589.7222900390625, + "epoch": 0.14995715509854327, + "grad_norm": 0.38185247778892517, + "kl": 0.0694580078125, + "learning_rate": 4.977581886436462e-07, + "loss": 0.0932, + "reward": -2.5740740299224854, + "reward_std": 0.472199410200119, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.33333333022892475, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 3721.1666259765625, + "epoch": 0.15081405312767782, + "grad_norm": 0.35303542017936707, + "kl": 0.0589599609375, + "learning_rate": 4.925612584627324e-07, + "loss": 0.0818, + "reward": -2.722222328186035, + "reward_std": 0.3082153648138046, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2777777910232544, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 3804.9444580078125, + "epoch": 0.15167095115681234, + "grad_norm": 0.37888699769973755, + "kl": 0.0810546875, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0564, + "reward": -2.7777777910232544, + "reward_std": 0.2221490517258644, + "rewards/confidence_reward": 
-3.0, + "rewards/format_reward": 0.22222222574055195, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 2334.8056640625, + "epoch": 0.15252784918594686, + "grad_norm": 0.49078041315078735, + "kl": 0.076171875, + "learning_rate": 4.821914294877326e-07, + "loss": 0.0592, + "reward": -1.5222030878067017, + "reward_std": 0.7962049543857574, + "rewards/confidence_reward": -2.2722031474113464, + "rewards/format_reward": 0.75, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 3370.944580078125, + "epoch": 0.1533847472150814, + "grad_norm": 0.2731315791606903, + "kl": 0.07763671875, + "learning_rate": 4.770199346024947e-07, + "loss": 0.0635, + "reward": -2.2685184478759766, + "reward_std": 0.6701640486717224, + "rewards/confidence_reward": -2.6296297311782837, + "rewards/format_reward": 0.361111119389534, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 3099.888916015625, + "epoch": 0.15424164524421594, + "grad_norm": 0.24881142377853394, + "kl": 0.0810546875, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0609, + "reward": -2.3072502613067627, + "reward_std": 0.5294433236122131, + "rewards/confidence_reward": -2.6683614253997803, + "rewards/format_reward": 0.36111112125217915, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 3101.72216796875, + "epoch": 0.15509854327335046, + "grad_norm": 1.0561292171478271, + "kl": 0.08544921875, + "learning_rate": 4.667072846298785e-07, + "loss": 0.2226, + "reward": -2.259259283542633, + "reward_std": 0.7795112058520317, + "rewards/confidence_reward": -2.703703761100769, + "rewards/format_reward": 0.4444444514811039, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 3213.9444580078125, + "epoch": 0.155955441302485, + "grad_norm": 1.2878997325897217, + "kl": 0.116455078125, + "learning_rate": 4.6156752571022637e-07, + "loss": 0.1861, + "reward": -2.1574074029922485, + "reward_std": 0.556253120303154, + "rewards/confidence_reward": 
-2.6296297311782837, + "rewards/format_reward": 0.472222238779068, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 3509.138916015625, + "epoch": 0.15681233933161953, + "grad_norm": 0.22494396567344666, + "kl": 0.082763671875, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0309, + "reward": -2.453703761100769, + "reward_std": 0.44099532812833786, + "rewards/confidence_reward": -2.8148149251937866, + "rewards/format_reward": 0.361111119389534, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 2710.97216796875, + "epoch": 0.15766923736075408, + "grad_norm": 0.5510160326957703, + "kl": 0.1025390625, + "learning_rate": 4.513246191154434e-07, + "loss": 0.0585, + "reward": -1.4814814329147339, + "reward_std": 1.0165367126464844, + "rewards/confidence_reward": -2.2037038803100586, + "rewards/format_reward": 0.7222222089767456, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 2879.8333740234375, + "epoch": 0.1585261353898886, + "grad_norm": 0.36165350675582886, + "kl": 0.10498046875, + "learning_rate": 4.4622285816590186e-07, + "loss": 0.0904, + "reward": -2.0740740299224854, + "reward_std": 0.648275762796402, + "rewards/confidence_reward": -2.7407408952713013, + "rewards/format_reward": 0.6666666865348816, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 3437.3612060546875, + "epoch": 0.15938303341902313, + "grad_norm": 0.8590340614318848, + "kl": 0.116455078125, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.1109, + "reward": -2.472222328186035, + "reward_std": 0.5155959874391556, + "rewards/confidence_reward": -2.888888955116272, + "rewards/format_reward": 0.4166666641831398, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 2466.5, + "epoch": 0.16023993144815768, + "grad_norm": 1.367401123046875, + "kl": 0.115234375, + "learning_rate": 4.360621743528392e-07, + "loss": 0.1976, + "reward": -2.037884473800659, + "reward_std": 0.6841873377561569, + 
"rewards/confidence_reward": -2.6489956378936768, + "rewards/format_reward": 0.6111111044883728, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 3534.27783203125, + "epoch": 0.1610968294772922, + "grad_norm": 0.3216284215450287, + "kl": 0.12939453125, + "learning_rate": 4.3100462708325914e-07, + "loss": 0.082, + "reward": -2.4722222089767456, + "reward_std": 0.3762567266821861, + "rewards/confidence_reward": -2.833333373069763, + "rewards/format_reward": 0.36111112125217915, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 3046.138916015625, + "epoch": 0.16195372750642673, + "grad_norm": 0.9523129463195801, + "kl": 0.142578125, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.1065, + "reward": -2.120370388031006, + "reward_std": 0.5169950574636459, + "rewards/confidence_reward": -2.5370370149612427, + "rewards/format_reward": 0.4166666716337204, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 3720.416748046875, + "epoch": 0.16281062553556128, + "grad_norm": 3.160400629043579, + "kl": 0.734375, + "learning_rate": 4.209385452800095e-07, + "loss": 0.103, + "reward": -2.6944445371627808, + "reward_std": 0.32624026387929916, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3055555671453476, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 2635.75, + "epoch": 0.1636675235646958, + "grad_norm": 0.7508611679077148, + "kl": 0.119384765625, + "learning_rate": 4.1593137353268303e-07, + "loss": 0.1062, + "reward": -1.9452918767929077, + "reward_std": 0.717644564807415, + "rewards/confidence_reward": -2.7786251306533813, + "rewards/format_reward": 0.8333333432674408, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 3603.5277099609375, + "epoch": 0.16452442159383032, + "grad_norm": 0.4415644109249115, + "kl": 0.17431640625, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0745, + "reward": -2.5000001192092896, + "reward_std": 0.34894853830337524, + 
"rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.444444440305233, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 3744.3612060546875, + "epoch": 0.16538131962296487, + "grad_norm": 0.7602421045303345, + "kl": 0.2197265625, + "learning_rate": 4.059721577093628e-07, + "loss": 0.1049, + "reward": -2.60185170173645, + "reward_std": 0.5628926083445549, + "rewards/confidence_reward": -2.8518519401550293, + "rewards/format_reward": 0.2500000074505806, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 3876.5833740234375, + "epoch": 0.1662382176520994, + "grad_norm": 0.7295367121696472, + "kl": 0.2158203125, + "learning_rate": 4.0102146195176887e-07, + "loss": 0.0704, + "reward": -2.6111111640930176, + "reward_std": 0.5082770437002182, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.3333333432674408, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 2431.638916015625, + "epoch": 0.16709511568123395, + "grad_norm": 0.7587151527404785, + "kl": 0.18994140625, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0699, + "reward": -1.851851761341095, + "reward_std": 0.8921651542186737, + "rewards/confidence_reward": -2.574074149131775, + "rewards/format_reward": 0.722222238779068, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 3446.888916015625, + "epoch": 0.16795201371036847, + "grad_norm": 1.1345640420913696, + "kl": 0.22265625, + "learning_rate": 3.911812458787591e-07, + "loss": 0.0911, + "reward": -2.407407283782959, + "reward_std": 0.4854222238063812, + "rewards/confidence_reward": -2.8518519401550293, + "rewards/format_reward": 0.4444444626569748, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 3630.3612060546875, + "epoch": 0.168808911739503, + "grad_norm": 1.328270673751831, + "kl": 0.26318359375, + "learning_rate": 3.86293057771082e-07, + "loss": 0.158, + "reward": -2.6018518209457397, + "reward_std": 
0.5880227982997894, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.305555559694767, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 3384.02783203125, + "epoch": 0.16966580976863754, + "grad_norm": 0.4739760160446167, + "kl": 0.2294921875, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0634, + "reward": -2.5, + "reward_std": 0.222149059176445, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5000000149011612, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 3382.8055419921875, + "epoch": 0.17052270779777207, + "grad_norm": 1.6106473207473755, + "kl": 0.306640625, + "learning_rate": 3.7658383023589833e-07, + "loss": 0.1537, + "reward": -2.3333333134651184, + "reward_std": 0.6125056594610214, + "rewards/confidence_reward": -2.7222222089767456, + "rewards/format_reward": 0.3888888880610466, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 3306.388916015625, + "epoch": 0.1713796058269066, + "grad_norm": 1.5265542268753052, + "kl": 0.2734375, + "learning_rate": 3.7176410528237945e-07, + "loss": 0.1782, + "reward": -2.1851852536201477, + "reward_std": 0.5692443251609802, + "rewards/confidence_reward": -2.629629611968994, + "rewards/format_reward": 0.4444444552063942, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 3631.8612060546875, + "epoch": 0.17223650385604114, + "grad_norm": 0.6827831864356995, + "kl": 0.2685546875, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0805, + "reward": -2.4814815521240234, + "reward_std": 0.5552610009908676, + "rewards/confidence_reward": -2.8148149251937866, + "rewards/format_reward": 0.3333333432674408, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 2938.4166259765625, + "epoch": 0.17309340188517566, + "grad_norm": 0.9321132898330688, + "kl": 0.30078125, + "learning_rate": 3.62197695483182e-07, + "loss": 0.0521, + "reward": -1.944444477558136, + "reward_std": 0.6303547322750092, 
+ "rewards/confidence_reward": -2.3888888359069824, + "rewards/format_reward": 0.4444444626569748, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 3787.6112060546875, + "epoch": 0.17395029991431019, + "grad_norm": 0.8994116187095642, + "kl": 0.31640625, + "learning_rate": 3.5745230577625573e-07, + "loss": 0.0792, + "reward": -2.75, + "reward_std": 0.2901904284954071, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2500000037252903, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 2426.9444580078125, + "epoch": 0.17480719794344474, + "grad_norm": 1.54979407787323, + "kl": 0.265625, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0357, + "reward": -1.6300533711910248, + "reward_std": 0.7251835763454437, + "rewards/confidence_reward": -2.3800532817840576, + "rewards/format_reward": 0.75, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 3264.6112060546875, + "epoch": 0.17566409597257926, + "grad_norm": 2.0934314727783203, + "kl": 0.22802734375, + "learning_rate": 3.4804036890979205e-07, + "loss": 0.1002, + "reward": -2.2971436977386475, + "reward_std": 0.725425198674202, + "rewards/confidence_reward": -2.7415882349014282, + "rewards/format_reward": 0.4444444477558136, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 3399.5001220703125, + "epoch": 0.17652099400171378, + "grad_norm": 0.9074891209602356, + "kl": 0.2763671875, + "learning_rate": 3.433750959758446e-07, + "loss": 0.0878, + "reward": -2.287036955356598, + "reward_std": 0.3180917277932167, + "rewards/confidence_reward": -2.5370370149612427, + "rewards/format_reward": 0.2500000074505806, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 4017.02783203125, + "epoch": 0.17737789203084833, + "grad_norm": 0.922761857509613, + "kl": 0.2080078125, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0294, + "reward": -2.638888955116272, + "reward_std": 0.3995024412870407, + "rewards/confidence_reward": 
-3.0, + "rewards/format_reward": 0.361111119389534, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 3856.416748046875, + "epoch": 0.17823479005998286, + "grad_norm": 1.1995811462402344, + "kl": 0.3154296875, + "learning_rate": 3.3412909903738936e-07, + "loss": 0.078, + "reward": -2.7777777910232544, + "reward_std": 0.30821534991264343, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2222222313284874, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 2836.25, + "epoch": 0.1790916880891174, + "grad_norm": 1.958234429359436, + "kl": 0.138671875, + "learning_rate": 3.295496267928609e-07, + "loss": 0.078, + "reward": -1.870370328426361, + "reward_std": 0.5447776764631271, + "rewards/confidence_reward": -2.4814815521240234, + "rewards/format_reward": 0.6111111342906952, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 3240.2777099609375, + "epoch": 0.17994858611825193, + "grad_norm": 6.24537992477417, + "kl": 0.2060546875, + "learning_rate": 3.250000000000001e-07, + "loss": 0.1677, + "reward": -2.046296238899231, + "reward_std": 0.6235695481300354, + "rewards/confidence_reward": -2.6296297311782837, + "rewards/format_reward": 0.5833333432674408, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 3614.166748046875, + "epoch": 0.18080548414738645, + "grad_norm": 0.7317115068435669, + "kl": 0.1962890625, + "learning_rate": 3.204808346054461e-07, + "loss": 0.0303, + "reward": -2.694444417953491, + "reward_std": 0.22736986726522446, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.2500000074505806, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 3610.52783203125, + "epoch": 0.181662382176521, + "grad_norm": 1.1794308423995972, + "kl": 0.22119140625, + "learning_rate": 3.159927424318531e-07, + "loss": 0.0538, + "reward": -2.666666626930237, + "reward_std": 0.2453947737812996, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 
0.3333333358168602, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 3340.8333740234375, + "epoch": 0.18251928020565553, + "grad_norm": 1.3902448415756226, + "kl": 0.216796875, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0582, + "reward": -2.5277777910232544, + "reward_std": 0.35992757976055145, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.4166666865348816, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 3454.6666259765625, + "epoch": 0.18337617823479005, + "grad_norm": 0.36289045214653015, + "kl": 0.1962890625, + "learning_rate": 3.0711220392181934e-07, + "loss": 0.0291, + "reward": -2.3055556416511536, + "reward_std": 0.4583980068564415, + "rewards/confidence_reward": -2.7222222089767456, + "rewards/format_reward": 0.4166666641831398, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 3966.5555419921875, + "epoch": 0.1842330762639246, + "grad_norm": 1.2311605215072632, + "kl": 0.2685546875, + "learning_rate": 3.027209598681373e-07, + "loss": 0.0363, + "reward": -2.638888955116272, + "reward_std": 0.4123065620660782, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.361111119389534, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 3175.3333740234375, + "epoch": 0.18508997429305912, + "grad_norm": 4.940278053283691, + "kl": 0.24658203125, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.1492, + "reward": -2.037036955356598, + "reward_std": 0.7587747424840927, + "rewards/confidence_reward": -2.7592592239379883, + "rewards/format_reward": 0.722222238779068, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 3212.5277099609375, + "epoch": 0.18594687232219365, + "grad_norm": 7.882691383361816, + "kl": 0.310546875, + "learning_rate": 2.9403949460371677e-07, + "loss": 0.2234, + "reward": -2.2685185074806213, + "reward_std": 0.5562530681490898, + "rewards/confidence_reward": -2.6296297311782837, + 
"rewards/format_reward": 0.3611111082136631, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 3286.27783203125, + "epoch": 0.1868037703513282, + "grad_norm": 2.209719657897949, + "kl": 0.3505859375, + "learning_rate": 2.897504487244061e-07, + "loss": 0.0156, + "reward": -2.370370388031006, + "reward_std": 0.5552610009908676, + "rewards/confidence_reward": -2.8148149251937866, + "rewards/format_reward": 0.444444440305233, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 3131.8333740234375, + "epoch": 0.18766066838046272, + "grad_norm": 6.650064945220947, + "kl": 0.3505859375, + "learning_rate": 2.854966364683872e-07, + "loss": 0.2357, + "reward": -1.9518136978149414, + "reward_std": 0.8104376196861267, + "rewards/confidence_reward": -2.4518136978149414, + "rewards/format_reward": 0.5000000149011612, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 3573.47216796875, + "epoch": 0.18851756640959727, + "grad_norm": 1.0854192972183228, + "kl": 0.427734375, + "learning_rate": 2.812786337337463e-07, + "loss": 0.1052, + "reward": -2.5185184478759766, + "reward_std": 0.5389864593744278, + "rewards/confidence_reward": -2.8518518209457397, + "rewards/format_reward": 0.3333333432674408, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 3518.25, + "epoch": 0.1893744644387318, + "grad_norm": 2.382499933242798, + "kl": 0.46484375, + "learning_rate": 2.770970115705341e-07, + "loss": 0.0109, + "reward": -2.740740656852722, + "reward_std": 0.22680462896823883, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.1666666716337204, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 3254.4166259765625, + "epoch": 0.19023136246786632, + "grad_norm": 1.3864353895187378, + "kl": 0.4208984375, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0677, + "reward": -2.388888955116272, + "reward_std": 0.3582318052649498, + "rewards/confidence_reward": -2.944444417953491, + 
"rewards/format_reward": 0.5555555522441864, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 3346.9444580078125, + "epoch": 0.19108826049700087, + "grad_norm": 1.5457087755203247, + "kl": 0.4716796875, + "learning_rate": 2.68845168455218e-07, + "loss": 0.0491, + "reward": -2.388888955116272, + "reward_std": 0.41752736270427704, + "rewards/confidence_reward": -2.888888955116272, + "rewards/format_reward": 0.5, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 2871.02783203125, + "epoch": 0.1919451585261354, + "grad_norm": 2.262847661972046, + "kl": 0.3876953125, + "learning_rate": 2.6477606467058035e-07, + "loss": 0.1352, + "reward": -1.7870370149612427, + "reward_std": 0.7662279903888702, + "rewards/confidence_reward": -2.3703705072402954, + "rewards/format_reward": 0.5833333432674408, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 3113.6944580078125, + "epoch": 0.1928020565552699, + "grad_norm": 1.4508579969406128, + "kl": 0.3828125, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.1048, + "reward": -1.7592592239379883, + "reward_std": 0.5638266205787659, + "rewards/confidence_reward": -2.3703704476356506, + "rewards/format_reward": 0.6111111044883728, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 3247.4444580078125, + "epoch": 0.19365895458440446, + "grad_norm": 0.9147429466247559, + "kl": 0.513671875, + "learning_rate": 2.567542470303452e-07, + "loss": 0.1006, + "reward": -2.1296297311782837, + "reward_std": 0.44895367324352264, + "rewards/confidence_reward": -2.574074149131775, + "rewards/format_reward": 0.4444444477558136, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 2793.9722290039062, + "epoch": 0.194515852613539, + "grad_norm": 1.462504506111145, + "kl": 0.3876953125, + "learning_rate": 2.528026192004466e-07, + "loss": 0.0483, + "reward": -1.8333333134651184, + "reward_std": 0.6497750803828239, + "rewards/confidence_reward": -2.3888890147209167, + 
"rewards/format_reward": 0.5555555522441864, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 3342.916748046875, + "epoch": 0.1953727506426735, + "grad_norm": 1.5023483037948608, + "kl": 0.494140625, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0784, + "reward": -2.4907407760620117, + "reward_std": 0.4541744738817215, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.4166666641831398, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 3135.138916015625, + "epoch": 0.19622964867180806, + "grad_norm": 2.827260971069336, + "kl": 0.53125, + "learning_rate": 2.450206003844205e-07, + "loss": 0.1925, + "reward": -2.130053400993347, + "reward_std": 0.9212456345558167, + "rewards/confidence_reward": -2.713386654853821, + "rewards/format_reward": 0.5833333432674408, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 2381.2500610351562, + "epoch": 0.19708654670094258, + "grad_norm": 4.513103008270264, + "kl": 0.28125, + "learning_rate": 2.411912629590699e-07, + "loss": 0.1358, + "reward": -1.7693658471107483, + "reward_std": 1.0916875302791595, + "rewards/confidence_reward": -2.4638103246688843, + "rewards/format_reward": 0.6944444477558136, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 3553.3055419921875, + "epoch": 0.19794344473007713, + "grad_norm": 2.0678796768188477, + "kl": 0.4443359375, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0392, + "reward": -2.5, + "reward_std": 0.27888670563697815, + "rewards/confidence_reward": -2.833333373069763, + "rewards/format_reward": 0.3333333432674408, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 3584.0833740234375, + "epoch": 0.19880034275921166, + "grad_norm": 2.108079433441162, + "kl": 0.408203125, + "learning_rate": 2.336585241584522e-07, + "loss": 0.0707, + "reward": -2.6944445371627808, + "reward_std": 0.06804138422012329, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 
0.3055555671453476, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 3742.2777099609375, + "epoch": 0.19965724078834618, + "grad_norm": 2.253603219985962, + "kl": 0.4912109375, + "learning_rate": 2.299561425954383e-07, + "loss": 0.1806, + "reward": -2.6388888359069824, + "reward_std": 0.47208072245121, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.305555559694767, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 2601.3056640625, + "epoch": 0.20051413881748073, + "grad_norm": 4.904290676116943, + "kl": 0.3974609375, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.2103, + "reward": -1.3475482165813446, + "reward_std": 1.2650677561759949, + "rewards/confidence_reward": -2.0975481271743774, + "rewards/format_reward": 0.75, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 2671.6944580078125, + "epoch": 0.20137103684661525, + "grad_norm": 1.6857246160507202, + "kl": 0.3486328125, + "learning_rate": 2.2268186129212807e-07, + "loss": 0.0329, + "reward": -1.7870370149612427, + "reward_std": 0.40415799617767334, + "rewards/confidence_reward": -2.425925850868225, + "rewards/format_reward": 0.638888880610466, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 3342.5277099609375, + "epoch": 0.20222793487574978, + "grad_norm": 1.9274342060089111, + "kl": 0.4453125, + "learning_rate": 2.1911094637307714e-07, + "loss": 0.108, + "reward": -2.3796294927597046, + "reward_std": 0.48020143806934357, + "rewards/confidence_reward": -2.8518518209457397, + "rewards/format_reward": 0.4722222238779068, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 2953.388916015625, + "epoch": 0.20308483290488433, + "grad_norm": 2.970611810684204, + "kl": 0.4501953125, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.1648, + "reward": -1.9360325932502747, + "reward_std": 0.8466525003314018, + "rewards/confidence_reward": -2.4082547426223755, + "rewards/format_reward": 
0.4722222164273262, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 3137.4166259765625, + "epoch": 0.20394173093401885, + "grad_norm": 0.7161883115768433, + "kl": 0.5009765625, + "learning_rate": 2.1210398515832536e-07, + "loss": 0.0883, + "reward": -2.138888895511627, + "reward_std": 0.7037928402423859, + "rewards/confidence_reward": -2.6111111640930176, + "rewards/format_reward": 0.4722222164273262, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 3184.7222900390625, + "epoch": 0.20479862896315337, + "grad_norm": 2.5145044326782227, + "kl": 0.638671875, + "learning_rate": 2.08668887493009e-07, + "loss": 0.1968, + "reward": -2.231481432914734, + "reward_std": 1.0531733930110931, + "rewards/confidence_reward": -2.703703761100769, + "rewards/format_reward": 0.472222238779068, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 3364.638916015625, + "epoch": 0.20565552699228792, + "grad_norm": 1.4906601905822754, + "kl": 0.533203125, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.1209, + "reward": -2.3888888359069824, + "reward_std": 0.3082153648138046, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.6111111342906952, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 3299.583251953125, + "epoch": 0.20651242502142245, + "grad_norm": 2.7216079235076904, + "kl": 0.529296875, + "learning_rate": 2.0193778326971628e-07, + "loss": 0.089, + "reward": -2.4722222089767456, + "reward_std": 0.24017397314310074, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.5277777910232544, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 3405.8055419921875, + "epoch": 0.207369323050557, + "grad_norm": 2.5555975437164307, + "kl": 0.71875, + "learning_rate": 1.986426879955034e-07, + "loss": 0.1951, + "reward": -2.0925925374031067, + "reward_std": 0.9167544841766357, + "rewards/confidence_reward": -2.5370370149612427, + "rewards/format_reward": 0.4444444477558136, + 
"step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 3524.3055419921875, + "epoch": 0.20822622107969152, + "grad_norm": 1.0115513801574707, + "kl": 0.794921875, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.1157, + "reward": -2.666666626930237, + "reward_std": 0.30821534991264343, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333358168602, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 3453.833251953125, + "epoch": 0.20908311910882604, + "grad_norm": 2.69391131401062, + "kl": 0.6103515625, + "learning_rate": 1.9219564157731844e-07, + "loss": 0.1843, + "reward": -2.5277777910232544, + "reward_std": 0.5088144540786743, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.472222238779068, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 3176.9444580078125, + "epoch": 0.2099400171379606, + "grad_norm": 3.2699921131134033, + "kl": 0.609375, + "learning_rate": 1.8904456326023027e-07, + "loss": 0.1116, + "reward": -1.9722222685813904, + "reward_std": 0.5316601991653442, + "rewards/confidence_reward": -2.5, + "rewards/format_reward": 0.5277777910232544, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 2784.6944580078125, + "epoch": 0.21079691516709512, + "grad_norm": 1.8682541847229004, + "kl": 0.56640625, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.1448, + "reward": -1.6203704178333282, + "reward_std": 0.6126016080379486, + "rewards/confidence_reward": -2.2037037014961243, + "rewards/format_reward": 0.5833333283662796, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 3728.638916015625, + "epoch": 0.21165381319622964, + "grad_norm": 3.5428807735443115, + "kl": 0.78515625, + "learning_rate": 1.8288942938012267e-07, + "loss": 0.0706, + "reward": -2.490740656852722, + "reward_std": 0.46697860956192017, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.4166666716337204, + "step": 247 + }, + { + 
"clip_ratio": 0.0, + "completion_length": 3008.138916015625, + "epoch": 0.2125107112253642, + "grad_norm": 1.3764863014221191, + "kl": 0.57421875, + "learning_rate": 1.7988620712370195e-07, + "loss": 0.0807, + "reward": -1.9478338956832886, + "reward_std": 0.7096292227506638, + "rewards/confidence_reward": -2.4478338956832886, + "rewards/format_reward": 0.5, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 3118.1944580078125, + "epoch": 0.2133676092544987, + "grad_norm": 3.4032788276672363, + "kl": 0.5830078125, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.1638, + "reward": -2.0277777314186096, + "reward_std": 0.8689809292554855, + "rewards/confidence_reward": -2.666666626930237, + "rewards/format_reward": 0.6388888955116272, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 3191.4166259765625, + "epoch": 0.21422450728363324, + "grad_norm": 1.8875763416290283, + "kl": 0.689453125, + "learning_rate": 1.7403048486417868e-07, + "loss": 0.1055, + "reward": -1.9545510709285736, + "reward_std": 0.48649683594703674, + "rewards/confidence_reward": -2.26010662317276, + "rewards/format_reward": 0.3055555522441864, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 2924.638916015625, + "epoch": 0.2150814053127678, + "grad_norm": 2.471829414367676, + "kl": 0.5673828125, + "learning_rate": 1.711787776321341e-07, + "loss": 0.1465, + "reward": -2.101851761341095, + "reward_std": 0.6704803705215454, + "rewards/confidence_reward": -2.7407407760620117, + "rewards/format_reward": 0.638888880610466, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 3481.8056640625, + "epoch": 0.2159383033419023, + "grad_norm": 1.7675886154174805, + "kl": 0.8046875, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.1342, + "reward": -2.314814805984497, + "reward_std": 0.5083017498254776, + "rewards/confidence_reward": -2.7037036418914795, + "rewards/format_reward": 0.3888888955116272, + "step": 252 + }, + { + 
"clip_ratio": 0.0, + "completion_length": 3527.4166259765625, + "epoch": 0.21679520137103683, + "grad_norm": 2.999549627304077, + "kl": 0.759765625, + "learning_rate": 1.6562960128876353e-07, + "loss": 0.0954, + "reward": -2.5185184478759766, + "reward_std": 0.3628873750567436, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.3888888955116272, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 3826.916748046875, + "epoch": 0.21765209940017138, + "grad_norm": 4.181079864501953, + "kl": 0.927734375, + "learning_rate": 1.6293288344708566e-07, + "loss": 0.0935, + "reward": -2.6944445371627808, + "reward_std": 0.3134361654520035, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.305555559694767, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 3448.47216796875, + "epoch": 0.2185089974293059, + "grad_norm": 1.8785117864608765, + "kl": 0.7265625, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0921, + "reward": -2.4175140261650085, + "reward_std": 0.5672558695077896, + "rewards/confidence_reward": -2.83418071269989, + "rewards/format_reward": 0.4166666604578495, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 3920.5833740234375, + "epoch": 0.21936589545844046, + "grad_norm": 7.422764778137207, + "kl": 0.84375, + "learning_rate": 1.5769701383645698e-07, + "loss": 0.034, + "reward": -2.7777777910232544, + "reward_std": 0.1360827535390854, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2222222276031971, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 3276.9444580078125, + "epoch": 0.22022279348757498, + "grad_norm": 6.746924877166748, + "kl": 0.640625, + "learning_rate": 1.551585709204381e-07, + "loss": 0.1587, + "reward": -1.9537036418914795, + "reward_std": 0.6170237511396408, + "rewards/confidence_reward": -2.425925850868225, + "rewards/format_reward": 0.4722222238779068, + "step": 257 + }, + { + "clip_ratio": 0.0, + 
"completion_length": 2808.27783203125, + "epoch": 0.2210796915167095, + "grad_norm": 9.45202350616455, + "kl": 0.42578125, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.224, + "reward": -1.5564029514789581, + "reward_std": 0.6021379753947258, + "rewards/confidence_reward": -2.0564029812812805, + "rewards/format_reward": 0.5, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.3611145019531, + "epoch": 0.22193658954584405, + "grad_norm": 1.4079711437225342, + "kl": 0.208984375, + "learning_rate": 1.5024238714314825e-07, + "loss": 0.0526, + "reward": -0.9999999701976776, + "reward_std": 1.1379348039627075, + "rewards/confidence_reward": -2.0000001192092896, + "rewards/format_reward": 1.0, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 3447.47216796875, + "epoch": 0.22279348757497858, + "grad_norm": 2.1422104835510254, + "kl": 0.6162109375, + "learning_rate": 1.4786531185446452e-07, + "loss": 0.1618, + "reward": -2.4018328189849854, + "reward_std": 0.5933989137411118, + "rewards/confidence_reward": -2.7907216548919678, + "rewards/format_reward": 0.3888889029622078, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 3852.666748046875, + "epoch": 0.2236503856041131, + "grad_norm": 2.077338457107544, + "kl": 0.70703125, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.1074, + "reward": -2.805555582046509, + "reward_std": 0.3134361580014229, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.1944444514811039, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 3562.7222900390625, + "epoch": 0.22450728363324765, + "grad_norm": 0.7478838562965393, + "kl": 0.5234375, + "learning_rate": 1.432748035231658e-07, + "loss": 0.0809, + "reward": -2.546296238899231, + "reward_std": 0.4309287667274475, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.3611111082136631, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 3409.388916015625, + 
"epoch": 0.22536418166238217, + "grad_norm": 1.5875946283340454, + "kl": 0.560546875, + "learning_rate": 1.4106199196189608e-07, + "loss": 0.0874, + "reward": -2.5, + "reward_std": 0.44037309288978577, + "rewards/confidence_reward": -2.722222328186035, + "rewards/format_reward": 0.2222222276031971, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 3616.8055419921875, + "epoch": 0.2262210796915167, + "grad_norm": 4.355445384979248, + "kl": 0.47265625, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.1239, + "reward": -2.6111111640930176, + "reward_std": 0.3814775198698044, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3888888955116272, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 3070.0833740234375, + "epoch": 0.22707797772065125, + "grad_norm": 1.2011481523513794, + "kl": 0.4541015625, + "learning_rate": 1.3680275190387675e-07, + "loss": 0.1067, + "reward": -2.2592591047286987, + "reward_std": 0.46683208644390106, + "rewards/confidence_reward": -2.7592592239379883, + "rewards/format_reward": 0.5000000149011612, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 3636.3333740234375, + "epoch": 0.22793487574978577, + "grad_norm": 2.6138198375701904, + "kl": 0.5703125, + "learning_rate": 1.3475690004005097e-07, + "loss": 0.1152, + "reward": -2.6111111640930176, + "reward_std": 0.29541122913360596, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3888888955116272, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 2078.4444580078125, + "epoch": 0.22879177377892032, + "grad_norm": 1.8686110973358154, + "kl": 0.294921875, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0517, + "reward": -1.2870370149612427, + "reward_std": 1.2355431914329529, + "rewards/confidence_reward": -2.203703761100769, + "rewards/format_reward": 0.9166666567325592, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 3596.3055419921875, + "epoch": 
0.22964867180805484, + "grad_norm": 5.496092319488525, + "kl": 0.66796875, + "learning_rate": 1.308341174832359e-07, + "loss": 0.023, + "reward": -2.75, + "reward_std": 0.2901904433965683, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.2500000074505806, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 2841.416748046875, + "epoch": 0.23050556983718937, + "grad_norm": 8.94375991821289, + "kl": 0.4580078125, + "learning_rate": 1.2895771787221088e-07, + "loss": 0.2275, + "reward": -1.8611111044883728, + "reward_std": 1.1322061419487, + "rewards/confidence_reward": -2.3888890743255615, + "rewards/format_reward": 0.5277777910232544, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 2849.9722900390625, + "epoch": 0.23136246786632392, + "grad_norm": 3.3087215423583984, + "kl": 0.439453125, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.1111, + "reward": -1.6481481790542603, + "reward_std": 0.5850364714860916, + "rewards/confidence_reward": -2.203703761100769, + "rewards/format_reward": 0.5555555671453476, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 3841.3055419921875, + "epoch": 0.23221936589545844, + "grad_norm": 4.227456569671631, + "kl": 0.708984375, + "learning_rate": 1.2537617212288742e-07, + "loss": 0.0629, + "reward": -2.6666667461395264, + "reward_std": 0.2453947588801384, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333432674408, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 2746.083251953125, + "epoch": 0.23307626392459296, + "grad_norm": 10.011512756347656, + "kl": 0.48046875, + "learning_rate": 1.2367151086855187e-07, + "loss": 0.2351, + "reward": -2.0833332538604736, + "reward_std": 0.8133133947849274, + "rewards/confidence_reward": -2.6666667461395264, + "rewards/format_reward": 0.5833333432674408, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 3753.7501220703125, + "epoch": 0.23393316195372751, + "grad_norm": 
2.311757802963257, + "kl": 0.7109375, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0975, + "reward": -2.6111111640930176, + "reward_std": 0.41752736270427704, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3888889029622078, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 3697.3333740234375, + "epoch": 0.23479005998286204, + "grad_norm": 1.5095380544662476, + "kl": 0.76953125, + "learning_rate": 1.2043556548852063e-07, + "loss": 0.1038, + "reward": -2.666666626930237, + "reward_std": 0.245394766330719, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.3333333432674408, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 3706.2222900390625, + "epoch": 0.23564695801199656, + "grad_norm": 1.542810320854187, + "kl": 0.72265625, + "learning_rate": 1.1890471945802999e-07, + "loss": 0.1013, + "reward": -2.694444417953491, + "reward_std": 0.20412413030862808, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.30555556900799274, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 2976.6944580078125, + "epoch": 0.2365038560411311, + "grad_norm": 5.164428234100342, + "kl": 0.4794921875, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.1522, + "reward": -1.759259283542633, + "reward_std": 0.9753341376781464, + "rewards/confidence_reward": -2.425925850868225, + "rewards/format_reward": 0.6666666567325592, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 3707.5555419921875, + "epoch": 0.23736075407026563, + "grad_norm": 4.900428295135498, + "kl": 0.76171875, + "learning_rate": 1.160183169482775e-07, + "loss": 0.0093, + "reward": -2.6666667461395264, + "reward_std": 0.2221490666270256, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.2777777835726738, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 3226.5, + "epoch": 0.23821765209940018, + "grad_norm": 2.8093297481536865, + "kl": 0.66796875, + 
"learning_rate": 1.1466315124171128e-07, + "loss": 0.0633, + "reward": -2.314814805984497, + "reward_std": 0.5322824418544769, + "rewards/confidence_reward": -2.8148149251937866, + "rewards/format_reward": 0.5, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 2714.388916015625, + "epoch": 0.2390745501285347, + "grad_norm": 12.48595142364502, + "kl": 0.61328125, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.2795, + "reward": -1.5740740895271301, + "reward_std": 0.9820237159729004, + "rewards/confidence_reward": -2.2407408952713013, + "rewards/format_reward": 0.6666666716337204, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 2800.166748046875, + "epoch": 0.23993144815766923, + "grad_norm": 1.2126076221466064, + "kl": 0.53125, + "learning_rate": 1.1212980823907929e-07, + "loss": 0.0903, + "reward": -1.9259259104728699, + "reward_std": 0.8028348684310913, + "rewards/confidence_reward": -2.425926089286804, + "rewards/format_reward": 0.5000000149011612, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 3716.638916015625, + "epoch": 0.24078834618680378, + "grad_norm": 2.4237496852874756, + "kl": 0.677734375, + "learning_rate": 1.1095197391710362e-07, + "loss": 0.0919, + "reward": -2.7222222089767456, + "reward_std": 0.3419027030467987, + "rewards/confidence_reward": -2.944444417953491, + "rewards/format_reward": 0.22222222574055195, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 3308.6944580078125, + "epoch": 0.2416452442159383, + "grad_norm": 1.1492081880569458, + "kl": 0.5546875, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0932, + "reward": -2.2592592239379883, + "reward_std": 0.5841350704431534, + "rewards/confidence_reward": -2.759259343147278, + "rewards/format_reward": 0.5000000149011612, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 3500.416748046875, + "epoch": 0.24250214224507283, + "grad_norm": 2.292973756790161, + "kl": 0.67578125, + 
"learning_rate": 1.0877477690980931e-07, + "loss": 0.1152, + "reward": -2.4629629850387573, + "reward_std": 0.44895368814468384, + "rewards/confidence_reward": -2.9074074029922485, + "rewards/format_reward": 0.4444444552063942, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 3665.1666259765625, + "epoch": 0.24335904027420738, + "grad_norm": 3.3193001747131348, + "kl": 0.6337890625, + "learning_rate": 1.0777570898211405e-07, + "loss": 0.0586, + "reward": -2.5370370149612427, + "reward_std": 0.41200246661901474, + "rewards/confidence_reward": -2.7592592239379883, + "rewards/format_reward": 0.22222222574055195, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 3264.75, + "epoch": 0.2442159383033419, + "grad_norm": 2.722202777862549, + "kl": 0.541015625, + "learning_rate": 1.068365111445064e-07, + "loss": 0.1034, + "reward": -2.185185134410858, + "reward_std": 0.6440667659044266, + "rewards/confidence_reward": -2.7962963581085205, + "rewards/format_reward": 0.6111111044883728, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 3782.638916015625, + "epoch": 0.24507283633247642, + "grad_norm": 1.6381499767303467, + "kl": 0.720703125, + "learning_rate": 1.0595731054933934e-07, + "loss": 0.0854, + "reward": -2.75, + "reward_std": 0.09128709882497787, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.25, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 3756.0, + "epoch": 0.24592973436161097, + "grad_norm": 1.275122880935669, + "kl": 0.6171875, + "learning_rate": 1.0513822622629978e-07, + "loss": 0.0672, + "reward": -2.694444417953491, + "reward_std": 0.1773533970117569, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.305555559694767, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 3949.111083984375, + "epoch": 0.2467866323907455, + "grad_norm": 2.43125057220459, + "kl": 0.619140625, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0689, + 
"reward": -2.8888888359069824, + "reward_std": 0.15932847559452057, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.11111111380159855, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 2842.9722900390625, + "epoch": 0.24764353041988005, + "grad_norm": 1.0015897750854492, + "kl": 0.3935546875, + "learning_rate": 1.0368084180643224e-07, + "loss": 0.0667, + "reward": -1.8897362351417542, + "reward_std": 0.7023327946662903, + "rewards/confidence_reward": -2.500847339630127, + "rewards/format_reward": 0.6111111342906952, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 3495.5833740234375, + "epoch": 0.24850042844901457, + "grad_norm": 3.5078701972961426, + "kl": 0.6806640625, + "learning_rate": 1.0304273901612565e-07, + "loss": 0.0511, + "reward": -2.3055556416511536, + "reward_std": 0.5096173286437988, + "rewards/confidence_reward": -2.6111111640930176, + "rewards/format_reward": 0.305555559694767, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 3017.638916015625, + "epoch": 0.2493573264781491, + "grad_norm": 5.197386264801025, + "kl": 0.466796875, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.1364, + "reward": -1.9907406568527222, + "reward_std": 0.7819060683250427, + "rewards/confidence_reward": -2.518518567085266, + "rewards/format_reward": 0.5277777910232544, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 3224.52783203125, + "epoch": 0.25021422450728364, + "grad_norm": 3.3562777042388916, + "kl": 0.552734375, + "learning_rate": 1.0194814420758804e-07, + "loss": 0.1602, + "reward": -2.0740740299224854, + "reward_std": 0.3628873825073242, + "rewards/confidence_reward": -2.5740740299224854, + "rewards/format_reward": 0.5, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 3234.861083984375, + "epoch": 0.25107112253641817, + "grad_norm": 3.984017848968506, + "kl": 0.572265625, + "learning_rate": 1.0149180037997228e-07, + "loss": 0.1834, + "reward": 
-2.114795684814453, + "reward_std": 0.7361937463283539, + "rewards/confidence_reward": -2.5870180130004883, + "rewards/format_reward": 0.472222238779068, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 3694.6666259765625, + "epoch": 0.2519280205655527, + "grad_norm": 1.1605571508407593, + "kl": 0.556640625, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.1052, + "reward": -2.6574074029922485, + "reward_std": 0.6070278584957123, + "rewards/confidence_reward": -2.8518518209457397, + "rewards/format_reward": 0.1944444514811039, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 3409.02783203125, + "epoch": 0.2527849185946872, + "grad_norm": 2.5758039951324463, + "kl": 0.54296875, + "learning_rate": 1.0076132877792932e-07, + "loss": 0.1731, + "reward": -2.5277777910232544, + "reward_std": 0.3936592862010002, + "rewards/confidence_reward": -2.833333373069763, + "rewards/format_reward": 0.3055555559694767, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 3830.166748046875, + "epoch": 0.2536418166238218, + "grad_norm": 2.108941078186035, + "kl": 0.630859375, + "learning_rate": 1.0048729989766394e-07, + "loss": 0.0874, + "reward": -2.805555582046509, + "reward_std": 0.20412414520978928, + "rewards/confidence_reward": -3.0, + "rewards/format_reward": 0.1944444514811039, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 3393.3333740234375, + "epoch": 0.2544987146529563, + "grad_norm": 3.105771064758301, + "kl": 0.484375, + "learning_rate": 1.002741278414069e-07, + "loss": 0.1061, + "reward": -2.240740716457367, + "reward_std": 0.654969185590744, + "rewards/confidence_reward": -2.796296238899231, + "rewards/format_reward": 0.5555555671453476, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 2507.611114501953, + "epoch": 0.25535561268209084, + "grad_norm": 9.724431991577148, + "kl": 0.693359375, + "learning_rate": 1.0012184146924223e-07, + "loss": 0.0067, + "reward": 
-2.000847339630127, + "reward_std": 0.5755332708358765, + "rewards/confidence_reward": -2.500847339630127, + "rewards/format_reward": 0.5, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 3396.9444580078125, + "epoch": 0.25621251071122536, + "grad_norm": 5.069945335388184, + "kl": 0.4638671875, + "learning_rate": 1.0003046139830701e-07, + "loss": 0.1936, + "reward": -2.2314815521240234, + "reward_std": 0.8970945179462433, + "rewards/confidence_reward": -2.759259343147278, + "rewards/format_reward": 0.5277777910232544, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 2547.6945190429688, + "epoch": 0.2570694087403599, + "grad_norm": 6.607841491699219, + "kl": 0.3818359375, + "learning_rate": 1e-07, + "loss": 0.1813, + "reward": -1.5749214887619019, + "reward_std": 0.6023647785186768, + "rewards/confidence_reward": -2.1860325932502747, + "rewards/format_reward": 0.6111111044883728, + "step": 300 + }, + { + "epoch": 0.2570694087403599, + "step": 300, + "total_flos": 0.0, + "train_loss": 0.09362538470280318, + "train_runtime": 37056.8122, + "train_samples_per_second": 0.291, + "train_steps_per_second": 0.008 + } + ], + "logging_steps": 1, + "max_steps": 300, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..65a46d6 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5293f42fb8bc5dfe8fde7c52810d2ae823576caf8e3aa14b0d71ad151cec33d +size 8376